Diffstat (limited to 'drivers/xen')
84 files changed, 19598 insertions, 8088 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 9e02d60a364b..f9a35ed266ec 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -1,70 +1,83 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "Xen driver support" depends on XEN config XEN_BALLOON bool "Xen memory balloon driver" - depends on !ARM default y help The balloon driver allows the Xen domain to request more memory from the system to expand the domain's memory allocation, or alternatively return unneeded memory to the system. -config XEN_SELFBALLOONING - bool "Dynamically self-balloon kernel memory to target" - depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM - default n - help - Self-ballooning dynamically balloons available kernel memory driven - by the current usage of anonymous memory ("committed AS") and - controlled by various sysfs-settable parameters. Configuring - FRONTSWAP is highly recommended; if it is not configured, self- - ballooning is disabled by default. If FRONTSWAP is configured, - frontswap-selfshrinking is enabled by default but can be disabled - with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning - is enabled by default but can be disabled with the 'tmem.selfballooning=0' - kernel boot parameter. Note that systems without a sufficiently - large swap device should not enable self-ballooning. - config XEN_BALLOON_MEMORY_HOTPLUG bool "Memory hotplug support for Xen balloon driver" - default n depends on XEN_BALLOON && MEMORY_HOTPLUG + default y help Memory hotplug support for Xen balloon driver allows expanding memory available for the system above limit declared at system startup. It is very useful on critical systems which require long run without rebooting. + It's also very useful for non PV domains to obtain unpopulated physical + memory ranges to use in order to map foreign memory or grants. + Memory could be hotplugged in following steps: - 1) dom0: xl mem-max <domU> <maxmem> + 1) target domain: ensure that memory auto online policy is in + effect by checking /sys/devices/system/memory/auto_online_blocks + file (should be 'online'). + + 2) control domain: xl mem-max <target-domain> <maxmem> where <maxmem> is >= requested memory size, - 2) dom0: xl mem-set <domU> <memory> + 3) control domain: xl mem-set <target-domain> <memory> where <memory> is requested memory size; alternatively memory could be added by writing proper value to /sys/devices/system/xen_memory/xen_memory0/target or - /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU, + /sys/devices/system/xen_memory/xen_memory0/target_kb on the + target domain. - 3) domU: for i in /sys/devices/system/memory/memory*/state; do \ - [ "`cat "$i"`" = offline ] && echo online > "$i"; done + Alternatively, if memory auto onlining was not requested at step 1 + the newly added memory can be manually onlined in the target domain + by doing the following: - Memory could be onlined automatically on domU by adding following line to udev rules: + for i in /sys/devices/system/memory/memory*/state; do \ + [ "`cat "$i"`" = offline ] && echo online > "$i"; done + + or by adding the following line to udev rules: SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" - In that case step 3 should be omitted. 
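The hotplug help text above ends at the sysfs interface it describes. As a rough sketch (not part of this patch), a guest-side tool could request a new balloon target by writing a size in KiB to the target_kb file named in that help text; the path comes from the text above, while the helper name, the 4 GiB figure and the error handling are invented for the example.

/* Hypothetical helper: request a new balloon target (in KiB) by writing
 * to the sysfs file mentioned in the XEN_BALLOON_MEMORY_HOTPLUG help text.
 * Not part of the patch; a sketch assuming the node exists on the guest.
 */
#include <stdio.h>

static int set_balloon_target_kb(unsigned long long kib)
{
	const char *path =
		"/sys/devices/system/xen_memory/xen_memory0/target_kb";
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fprintf(f, "%llu\n", kib) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Ask for 4 GiB; the balloon driver then works toward that target. */
	if (set_balloon_target_kb(4ULL * 1024 * 1024))
		perror("set_balloon_target_kb");
	return 0;
}

Such a write is expected to be forwarded by the xen-balloon sysfs code to balloon_set_new_target(), which, as seen later in this diff, only updates balloon_stats.target_pages and wakes the xen-balloon thread.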
+config XEN_MEMORY_HOTPLUG_LIMIT + int "Hotplugged memory limit (in GiB) for a PV guest" + default 512 + depends on XEN_HAVE_PVMMU + depends on MEMORY_HOTPLUG + help + Maximum amount of memory (in GiB) that a PV guest can be + expanded to when using memory hotplug. + + A PV guest can have more memory than this limit if is + started with a larger maximum. + + This value is used to allocate enough space in internal + tables needed for physical memory administration. -config XEN_SCRUB_PAGES - bool "Scrub pages before returning them to system" +config XEN_SCRUB_PAGES_DEFAULT + bool "Scrub pages before returning them to system by default" depends on XEN_BALLOON default y help Scrub pages before returning them to the system for reuse by other domains. This makes sure that any confidential data - is not accidentally visible to other domains. Is it more - secure, but slightly less efficient. + is not accidentally visible to other domains. It is more + secure, but slightly less efficient. This can be controlled with + xen_scrub_pages=0 parameter and + /sys/devices/system/xen_memory/xen_memory0/scrub_pages. + This option only sets the default value. + If in doubt, say yes. config XEN_DEV_EVTCHN @@ -78,8 +91,7 @@ config XEN_DEV_EVTCHN config XEN_BACKEND bool "Backend driver support" - depends on XEN_DOM0 - default y + default XEN_DOM0 help Support for backend device drivers that provide I/O services to other virtual machines. @@ -96,27 +108,27 @@ config XENFS If in doubt, say yes. config XEN_COMPAT_XENFS - bool "Create compatibility mount point /proc/xen" - depends on XENFS - default y - help - The old xenstore userspace tools expect to find "xenbus" - under /proc/xen, but "xenbus" is now found at the root of the - xenfs filesystem. Selecting this causes the kernel to create - the compatibility mount point /proc/xen if it is running on - a xen platform. - If in doubt, say yes. + bool "Create compatibility mount point /proc/xen" + depends on XENFS + default y + help + The old xenstore userspace tools expect to find "xenbus" + under /proc/xen, but "xenbus" is now found at the root of the + xenfs filesystem. Selecting this causes the kernel to create + the compatibility mount point /proc/xen if it is running on + a xen platform. + If in doubt, say yes. config XEN_SYS_HYPERVISOR - bool "Create xen entries under /sys/hypervisor" - depends on SYSFS - select SYS_HYPERVISOR - default y - help - Create entries under /sys/hypervisor describing the Xen - hypervisor environment. When running native or in another - virtual environment, /sys/hypervisor will still be present, - but will have no xen contents. + bool "Create xen entries under /sys/hypervisor" + depends on SYSFS + select SYS_HYPERVISOR + default y + help + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, + but will have no xen contents. config XEN_XENBUS_FRONTEND tristate @@ -126,9 +138,21 @@ config XEN_GNTDEV depends on XEN default m select MMU_NOTIFIER + select FIND_NORMAL_PAGE help Allows userspace processes to use grants. +config XEN_GNTDEV_DMABUF + bool "Add support for dma-buf grant access device driver extension" + depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC + select DMA_SHARED_BUFFER + help + Allows userspace processes and kernel modules to use Xen backed + dma-buf implementation. 
With this extension grant references to + the pages of an imported dma-buf can be exported for other domain + use and grant references coming from a foreign domain can be + converted into a local dma-buf for local export. + config XEN_GRANT_DEV_ALLOC tristate "User-space grant reference allocator driver" depends on XEN @@ -138,23 +162,54 @@ config XEN_GRANT_DEV_ALLOC to other domains. This can be used to implement frontend drivers or as part of an inter-domain shared memory channel. +config XEN_GRANT_DMA_ALLOC + bool "Allow allocating DMA capable buffers with grant reference module" + depends on XEN && HAS_DMA + help + Extends grant table module API to allow allocating DMA capable + buffers and mapping foreign grant references on top of it. + The resulting buffer is similar to one allocated by the balloon + driver in that proper memory reservation is made by + ({increase|decrease}_reservation and VA mappings are updated if + needed). + This is useful for sharing foreign buffers with HW drivers which + cannot work with scattered buffers provided by the balloon driver, + but require DMAable memory instead. + config SWIOTLB_XEN def_bool y - depends on PCI && X86 + depends on ARCH_HAS_DMA_OPS + depends on XEN_PV || ARM || ARM64 select SWIOTLB -config XEN_TMEM - tristate - depends on !ARM - default m if (CLEANCACHE || FRONTSWAP) +config XEN_PCI_STUB + bool + +config XEN_PCIDEV_STUB + tristate "Xen PCI-device stub driver" + depends on PCI && !X86 && XEN + depends on XEN_BACKEND + select XEN_PCI_STUB + default m help - Shim to interface in-kernel Transcendent Memory hooks - (e.g. cleancache and frontswap) to Xen tmem hypercalls. + The PCI device stub driver provides limited version of the PCI + device backend driver without para-virtualized support for guests. + If you select this to be a module, you will need to make sure no + other driver has bound to the device(s) you want to make visible to + other guests. + + The "hide" parameter (only applicable if backend driver is compiled + into the kernel) allows you to bind the PCI devices to this module + from the default device drivers. The argument is the list of PCI BDFs: + xen-pciback.hide=(03:00.0)(04:00.0) + + If in doubt, say m. config XEN_PCIDEV_BACKEND tristate "Xen PCI-device backend driver" depends on PCI && X86 && XEN depends on XEN_BACKEND + select XEN_PCI_STUB default m help The PCI device backend driver allows the kernel to export arbitrary @@ -174,72 +229,147 @@ config XEN_PCIDEV_BACKEND If in doubt, say m. -config XEN_PRIVCMD - tristate - depends on XEN - default m - -config XEN_STUB - bool "Xen stub drivers" - depends on XEN && X86_64 && BROKEN - default n +config XEN_PVCALLS_FRONTEND + tristate "XEN PV Calls frontend driver" + depends on INET && XEN + select XEN_XENBUS_FRONTEND help - Allow kernel to install stub drivers, to reserve space for Xen drivers, - i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, - so that real Xen drivers can be modular. - - To enable Xen features like cpu and memory hotplug, select Y here. - -config XEN_ACPI_HOTPLUG_MEMORY - tristate "Xen ACPI memory hotplug" - depends on XEN_DOM0 && XEN_STUB && ACPI - default n + Experimental frontend for the Xen PV Calls protocol + (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It + sends a small set of POSIX calls to the backend, which + implements them. + +config XEN_PVCALLS_BACKEND + tristate "XEN PV Calls backend driver" + depends on INET && XEN && XEN_BACKEND help - This is Xen ACPI memory hotplug. 
+ Experimental backend for the Xen PV Calls protocol + (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It + allows PV Calls frontends to send POSIX calls to the backend, + which implements them. - Currently Xen only support ACPI memory hot-add. If you want - to hot-add memory at runtime (the hot-added memory cannot be - removed until machine stop), select Y/M here, otherwise select N. + If in doubt, say n. -config XEN_ACPI_HOTPLUG_CPU - tristate "Xen ACPI cpu hotplug" - depends on XEN_DOM0 && XEN_STUB && ACPI - select ACPI_CONTAINER - default n +config XEN_SCSI_BACKEND + tristate "XEN SCSI backend driver" + depends on XEN && XEN_BACKEND && TARGET_CORE help - Xen ACPI cpu enumerating and hotplugging + The SCSI backend driver allows the kernel to export its SCSI Devices + to other guests via a high-performance shared-memory interface. + Only needed for systems running as XEN driver domains (e.g. Dom0) and + if guests need generic access to SCSI devices. - For hotplugging, currently Xen only support ACPI cpu hotadd. - If you want to hotadd cpu at runtime (the hotadded cpu cannot - be removed until machine stop), select Y/M here. +config XEN_PRIVCMD + tristate "Xen hypercall passthrough driver" + depends on XEN + default m + help + The hypercall passthrough driver allows privileged user programs to + perform Xen hypercalls. This driver is normally required for systems + running as Dom0 to perform privileged operations, but in some + disaggregated Xen setups this driver might be needed for other + domains, too. + +config XEN_PRIVCMD_EVENTFD + bool "Xen Ioeventfd and irqfd support" + depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD + help + Using the ioeventfd / irqfd mechanism a virtio backend running in a + daemon can speed up interrupt delivery from / to a guest. config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" - depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ + depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ default m help - This ACPI processor uploads Power Management information to the Xen + This ACPI processor uploads Power Management information to the Xen hypervisor. To do that the driver parses the Power Management data and uploads said information to the Xen hypervisor. Then the Xen hypervisor can - select the proper Cx and Pxx states. It also registers itslef as the + select the proper Cx and Pxx states. It also registers itself as the SMM so that other drivers (such as ACPI cpufreq scaling driver) will not load. - To compile this driver as a module, choose M here: the module will be + To compile this driver as a module, choose M here: the module will be called xen_acpi_processor If you do not know what to choose, select M here. If the CPUFREQ drivers are built in, select Y here. config XEN_MCE_LOG bool "Xen platform mcelog" - depends on XEN_DOM0 && X86_64 && X86_MCE - default n + depends on XEN_PV_DOM0 && X86_MCE help Allow kernel fetching MCE error from Xen platform and converting it into Linux mcelog format for mcelog tools config XEN_HAVE_PVMMU - bool + bool + +config XEN_EFI + def_bool y + depends on (ARM || ARM64 || X86_64) && EFI + +config XEN_AUTO_XLATE + def_bool y + depends on ARM || ARM64 || XEN_PVHVM + help + Support for auto-translated physmap guests. 
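Among the entries above, XEN_PRIVCMD is the one userspace tooling relies on: it exposes a character device through which privileged programs issue hypercalls. As a rough, hedged sketch (again not part of the patch), a tool might open the privcmd device and use IOCTL_PRIVCMD_HYPERCALL to ask the hypervisor for its version. It assumes the kernel uapi header <xen/privcmd.h> is installed; the numeric constants below mirror the public Xen ABI values for __HYPERVISOR_xen_version and XENVER_version and are hard-coded only to keep the example self-contained.

/* Hypothetical example, not part of the patch: issue a harmless hypercall
 * through the privcmd device provided by CONFIG_XEN_PRIVCMD.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/privcmd.h>	/* struct privcmd_hypercall, IOCTL_PRIVCMD_HYPERCALL */

#define HYPERVISOR_xen_version	17	/* __HYPERVISOR_xen_version (Xen ABI) */
#define XENVER_version		0

int main(void)
{
	struct privcmd_hypercall call = {
		.op  = HYPERVISOR_xen_version,
		.arg = { XENVER_version, 0, 0, 0, 0 },
	};
	int fd = open("/dev/xen/privcmd", O_RDWR);	/* or /proc/xen/privcmd */
	long ver;

	if (fd < 0) {
		perror("open privcmd");
		return 1;
	}
	ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
	if (ver < 0)
		perror("IOCTL_PRIVCMD_HYPERCALL");
	else
		printf("Xen %ld.%ld\n", ver >> 16, ver & 0xffff);
	close(fd);
	return 0;
}

The xl toolstack and libxenctrl go through this same device for the privileged operations mentioned in the XEN_PRIVCMD help text.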
+ +config XEN_ACPI + def_bool y + depends on X86 && ACPI + +config XEN_SYMS + bool "Xen symbols" + depends on X86 && XEN_DOM0 && XENFS + default y if KALLSYMS + help + Exports hypervisor symbols (along with their types and addresses) via + /proc/xen/xensyms file, similar to /proc/kallsyms + +config XEN_HAVE_VPMU + bool + +config XEN_FRONT_PGDIR_SHBUF + tristate + +config XEN_UNPOPULATED_ALLOC + bool "Use unpopulated memory ranges for guest mappings" + depends on ZONE_DEVICE + default XEN_BACKEND || XEN_GNTDEV || XEN_DOM0 + help + Use unpopulated memory ranges in order to create mappings for guest + memory regions, including grant maps and foreign pages. This avoids + having to balloon out RAM regions in order to obtain physical memory + space to create such mappings. + +config XEN_GRANT_DMA_IOMMU + bool + select IOMMU_API + +config XEN_GRANT_DMA_OPS + bool + +config XEN_VIRTIO + bool "Xen virtio support" + depends on ARCH_HAS_DMA_OPS + depends on VIRTIO + select XEN_GRANT_DMA_OPS + select XEN_GRANT_DMA_IOMMU if OF + help + Enable virtio support for running as Xen guest. Depending on the + guest type this will require special support on the backend side + (qemu or kernel, depending on the virtio device types used). + + If in doubt, say n. + +config XEN_VIRTIO_FORCE_GRANT + bool "Require Xen virtio support to use grants" + depends on XEN_VIRTIO + help + Require virtio for Xen guests to use grant mappings. + This will avoid the need to give the backend the right to map all + of the guest memory. This will need support on the backend side + (e.g. qemu or kernel, depending on the virtio device types used). endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index eabd0ee1c2bc..c0503f1c7d5b 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,40 +1,43 @@ -ifneq ($(CONFIG_ARM),y) -obj-y += manage.o +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -endif -obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o events.o balloon.o +obj-y += grant-table.o features.o balloon.o manage.o time.o +obj-y += mem-reservation.o +obj-y += events/ obj-y += xenbus/ -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_features.o := $(nostackp) +CFLAGS_features.o := -fno-stack-protector +dom0-$(CONFIG_ARM64) += arm-device.o dom0-$(CONFIG_PCI) += pci.o dom0-$(CONFIG_USB_SUPPORT) += dbgp.o -dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y) xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o -obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PVHVM) += platform-pci.o -obj-$(CONFIG_XEN_TMEM) += tmem.o +obj-$(CONFIG_XEN_PVHVM_GUEST) += platform-pci.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o -obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ +obj-$(CONFIG_XEN_PCI_STUB) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o -obj-$(CONFIG_XEN_STUB) += xen-stub.o -obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o -obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o 
+obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o +obj-$(CONFIG_XEN_AUTO_XLATE) += xlate_mmu.o +obj-$(CONFIG_XEN_PVCALLS_BACKEND) += pvcalls-back.o +obj-$(CONFIG_XEN_PVCALLS_FRONTEND) += pvcalls-front.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o +xen-gntdev-$(CONFIG_XEN_GNTDEV_DMABUF) += gntdev-dmabuf.o xen-gntalloc-y := gntalloc.o -xen-privcmd-y := privcmd.o +xen-privcmd-y := privcmd.o privcmd-buf.o +obj-$(CONFIG_XEN_FRONT_PGDIR_SHBUF) += xen-front-pgdir-shbuf.o +obj-$(CONFIG_XEN_UNPOPULATED_ALLOC) += unpopulated-alloc.o +obj-$(CONFIG_XEN_GRANT_DMA_OPS) += grant-dma-ops.o +obj-$(CONFIG_XEN_GRANT_DMA_IOMMU) += grant-dma-iommu.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c index 119d42a2bf57..d2ee605c5ca1 100644 --- a/drivers/xen/acpi.c +++ b/drivers/xen/acpi.c @@ -30,33 +30,122 @@ * IN THE SOFTWARE. */ +#include <linux/pci.h> #include <xen/acpi.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -int xen_acpi_notify_hypervisor_state(u8 sleep_state, - u32 pm1a_cnt, u32 pm1b_cnt) +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) { + unsigned int bits = extended ? 8 : 16; + struct xen_platform_op op = { .cmd = XENPF_enter_acpi_sleep, .interface_version = XENPF_INTERFACE_VERSION, - .u = { - .enter_acpi_sleep = { - .pm1a_cnt_val = (u16)pm1a_cnt, - .pm1b_cnt_val = (u16)pm1b_cnt, - .sleep_state = sleep_state, - }, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, }, }; - if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) { - WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!" - "Email xen-devel@lists.xensource.com Thank you.\n", \ - pm1a_cnt, pm1b_cnt); + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) return -1; - } - HYPERVISOR_dom0_op(&op); + HYPERVISOR_platform_op(&op); return 1; } + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} + +struct acpi_prt_entry { + struct acpi_pci_id id; + u8 pin; + acpi_handle link; + u32 index; +}; + +int xen_acpi_get_gsi_info(struct pci_dev *dev, + int *gsi_out, + int *trigger_out, + int *polarity_out) +{ + int gsi; + u8 pin; + struct acpi_prt_entry *entry; + int trigger = ACPI_LEVEL_SENSITIVE; + int polarity = acpi_irq_model == ACPI_IRQ_MODEL_GIC ? 
+ ACPI_ACTIVE_HIGH : ACPI_ACTIVE_LOW; + + if (!dev || !gsi_out || !trigger_out || !polarity_out) + return -EINVAL; + + pin = dev->pin; + if (!pin) + return -EINVAL; + + entry = acpi_pci_irq_lookup(dev, pin); + if (entry) { + if (entry->link) + gsi = acpi_pci_link_allocate_irq(entry->link, + entry->index, + &trigger, &polarity, + NULL); + else + gsi = entry->index; + } else + gsi = -1; + + if (gsi < 0) + return -EINVAL; + + *gsi_out = gsi; + *trigger_out = trigger; + *polarity_out = polarity; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_acpi_get_gsi_info); + +static get_gsi_from_sbdf_t get_gsi_from_sbdf; +static DEFINE_RWLOCK(get_gsi_from_sbdf_lock); + +void xen_acpi_register_get_gsi_func(get_gsi_from_sbdf_t func) +{ + write_lock(&get_gsi_from_sbdf_lock); + get_gsi_from_sbdf = func; + write_unlock(&get_gsi_from_sbdf_lock); +} +EXPORT_SYMBOL_GPL(xen_acpi_register_get_gsi_func); + +int xen_acpi_get_gsi_from_sbdf(u32 sbdf) +{ + int ret = -EOPNOTSUPP; + + read_lock(&get_gsi_from_sbdf_lock); + if (get_gsi_from_sbdf) + ret = get_gsi_from_sbdf(sbdf); + read_unlock(&get_gsi_from_sbdf_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_acpi_get_gsi_from_sbdf); diff --git a/drivers/xen/arm-device.c b/drivers/xen/arm-device.c new file mode 100644 index 000000000000..87493f92291f --- /dev/null +++ b/drivers/xen/arm-device.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2015, Linaro Limited, Shannon Zhao + */ + +#include <linux/platform_device.h> +#include <linux/acpi.h> +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/interface/memory.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +static int xen_unmap_device_mmio(const struct resource *resources, + unsigned int count) +{ + unsigned int i, j, nr; + int rc = 0; + const struct resource *r; + struct xen_remove_from_physmap xrp; + + for (i = 0; i < count; i++) { + r = &resources[i]; + nr = DIV_ROUND_UP(resource_size(r), XEN_PAGE_SIZE); + if ((resource_type(r) != IORESOURCE_MEM) || (nr == 0)) + continue; + + for (j = 0; j < nr; j++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = XEN_PFN_DOWN(r->start) + j; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, + &xrp); + if (rc) + return rc; + } + } + + return rc; +} + +static int xen_map_device_mmio(const struct resource *resources, + unsigned int count) +{ + unsigned int i, j, nr; + int rc = 0; + const struct resource *r; + xen_pfn_t *gpfns; + xen_ulong_t *idxs; + int *errs; + + for (i = 0; i < count; i++) { + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .space = XENMAPSPACE_dev_mmio + }; + + r = &resources[i]; + nr = DIV_ROUND_UP(resource_size(r), XEN_PAGE_SIZE); + if ((resource_type(r) != IORESOURCE_MEM) || (nr == 0)) + continue; + + gpfns = kcalloc(nr, sizeof(xen_pfn_t), GFP_KERNEL); + idxs = kcalloc(nr, sizeof(xen_ulong_t), GFP_KERNEL); + errs = kcalloc(nr, sizeof(int), GFP_KERNEL); + if (!gpfns || !idxs || !errs) { + kfree(gpfns); + kfree(idxs); + kfree(errs); + rc = -ENOMEM; + goto unmap; + } + + for (j = 0; j < nr; j++) { + /* + * The regions are always mapped 1:1 to DOM0 and this is + * fine because the memory map for DOM0 is the same as + * the host (except for the RAM). 
+ */ + gpfns[j] = XEN_PFN_DOWN(r->start) + j; + idxs[j] = XEN_PFN_DOWN(r->start) + j; + } + + xatp.size = nr; + + set_xen_guest_handle(xatp.gpfns, gpfns); + set_xen_guest_handle(xatp.idxs, idxs); + set_xen_guest_handle(xatp.errs, errs); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + kfree(gpfns); + kfree(idxs); + kfree(errs); + if (rc) + goto unmap; + } + + return rc; + +unmap: + xen_unmap_device_mmio(resources, i); + return rc; +} + +static int xen_platform_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct platform_device *pdev = to_platform_device(data); + int r = 0; + + if (pdev->num_resources == 0 || pdev->resource == NULL) + return NOTIFY_OK; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + r = xen_map_device_mmio(pdev->resource, pdev->num_resources); + break; + case BUS_NOTIFY_DEL_DEVICE: + r = xen_unmap_device_mmio(pdev->resource, pdev->num_resources); + break; + default: + return NOTIFY_DONE; + } + if (r) + dev_err(&pdev->dev, "Platform: Failed to %s device %s MMIO!\n", + action == BUS_NOTIFY_ADD_DEVICE ? "map" : + (action == BUS_NOTIFY_DEL_DEVICE ? "unmap" : "?"), + pdev->name); + + return NOTIFY_OK; +} + +static struct notifier_block platform_device_nb = { + .notifier_call = xen_platform_notifier, +}; + +static int __init register_xen_platform_notifier(void) +{ + if (!xen_initial_domain() || acpi_disabled) + return 0; + + return bus_register_notifier(&platform_bus_type, &platform_device_nb); +} + +arch_initcall(register_xen_platform_notifier); + +#ifdef CONFIG_ARM_AMBA +#include <linux/amba/bus.h> + +static int xen_amba_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct amba_device *adev = to_amba_device(data); + int r = 0; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + r = xen_map_device_mmio(&adev->res, 1); + break; + case BUS_NOTIFY_DEL_DEVICE: + r = xen_unmap_device_mmio(&adev->res, 1); + break; + default: + return NOTIFY_DONE; + } + if (r) + dev_err(&adev->dev, "AMBA: Failed to %s device %s MMIO!\n", + action == BUS_NOTIFY_ADD_DEVICE ? "map" : + (action == BUS_NOTIFY_DEL_DEVICE ? 
"unmap" : "?"), + adev->dev.init_name); + + return NOTIFY_OK; +} + +static struct notifier_block amba_device_nb = { + .notifier_call = xen_amba_notifier, +}; + +static int __init register_xen_amba_notifier(void) +{ + if (!xen_initial_domain() || acpi_disabled) + return 0; + + return bus_register_notifier(&amba_bustype, &amba_device_nb); +} + +arch_initcall(register_xen_amba_notifier); +#endif diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2a2ef97697b2..49c3f9926394 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -38,12 +38,15 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/errno.h> -#include <linux/module.h> +#include <linux/freezer.h> +#include <linux/kthread.h> #include <linux/mm.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/mutex.h> @@ -52,10 +55,13 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/moduleparam.h> +#include <linux/jiffies.h> #include <asm/page.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/xen/hypervisor.h> @@ -67,21 +73,57 @@ #include <xen/balloon.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/mem-reservation.h> + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static uint __read_mostly balloon_boot_timeout = 180; +module_param(balloon_boot_timeout, uint, 0444); + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static int xen_hotplug_unpopulated; + +static const struct ctl_table balloon_table[] = { + { + .procname = "hotplug_unpopulated", + .data = &xen_hotplug_unpopulated, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +#else +#define xen_hotplug_unpopulated 0 +#endif + +/* + * Use one extent per PAGE_SIZE to avoid to break down the page into + * multiple frame. + */ +#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) /* - * balloon_process() state: + * balloon_thread() state: * * BP_DONE: done or nothing to do, + * BP_WAIT: wait to be rescheduled, * BP_EAGAIN: error, go to sleep, * BP_ECANCELED: error, balloon operation canceled. */ -enum bp_state { +static enum bp_state { BP_DONE, + BP_WAIT, BP_EAGAIN, BP_ECANCELED -}; +} balloon_state = BP_DONE; +/* Main waiting point for xen-balloon thread. */ +static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq); static DEFINE_MUTEX(balloon_mutex); @@ -89,30 +131,24 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; + /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); - -/* Main work function, always executed in process context. */ -static void balloon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); +static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. 
*/ #define GFP_BALLOON \ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) -static void scrub_page(struct page *page) -{ -#ifdef CONFIG_XEN_SCRUB_PAGES - clear_highpage(page); -#endif -} - /* balloon_append: add the given page to the balloon. */ -static void __balloon_append(struct page *page) +static void balloon_append(struct page *page) { + if (!PageOffline(page)) + __SetPageOffline(page); + /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { list_add_tail(&page->lru, &ballooned_pages); @@ -121,26 +157,22 @@ static void __balloon_append(struct page *page) list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; } -} + inc_node_page_state(page, NR_BALLOON_PAGES); -static void balloon_append(struct page *page) -{ - __balloon_append(page); - adjust_managed_page_count(page, -1); + wake_up(&balloon_wq); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(bool prefer_highmem) +static struct page *balloon_retrieve(bool require_lowmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - if (prefer_highmem) - page = list_entry(ballooned_pages.prev, struct page, lru); - else - page = list_entry(ballooned_pages.next, struct page, lru); + page = list_entry(ballooned_pages.next, struct page, lru); + if (require_lowmem && PageHighMem(page)) + return NULL; list_del(&page->lru); if (PageHighMem(page)) @@ -148,18 +180,12 @@ static struct page *balloon_retrieve(bool prefer_highmem) else balloon_stats.balloon_low--; - adjust_managed_page_count(page, 1); + __ClearPageOffline(page); + dec_node_page_state(page, NR_BALLOON_PAGES); return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -168,12 +194,15 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static enum bp_state update_schedule(enum bp_state state) +static void update_schedule(void) { - if (state == BP_DONE) { + if (balloon_state == BP_WAIT || balloon_state == BP_ECANCELED) + return; + + if (balloon_state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_DONE; + return; } ++balloon_stats.retry_count; @@ -182,7 +211,8 @@ static enum bp_state update_schedule(enum bp_state state) balloon_stats.retry_count > balloon_stats.max_retry_count) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_ECANCELED; + balloon_state = BP_ECANCELED; + return; } balloon_stats.schedule_delay <<= 1; @@ -190,81 +220,146 @@ static enum bp_state update_schedule(enum bp_state state) if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; - return BP_EAGAIN; + balloon_state = BP_EAGAIN; } #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static long current_credit(void) +static void release_memory_resource(struct resource *resource) { - return balloon_stats.target_pages - balloon_stats.current_pages - - balloon_stats.hotplug_pages; + if (!resource) + return; + + /* + * No need to reset region to identity mapped since we now + * know that no I/O can be in this region + */ + release_resource(resource); + kfree(resource); } -static bool balloon_is_inflated(void) +static struct resource 
*additional_memory_resource(phys_addr_t size) { - if (balloon_stats.balloon_low || balloon_stats.balloon_high || - balloon_stats.balloon_hotplug) - return true; - else - return false; -} + struct resource *res; + int ret; -/* - * reserve_additional_memory() adds memory region of size >= credit above - * max_pfn. New region is section aligned and size is modified to be multiple - * of section size. Those features allow optimal use of address space and - * establish proper alignment when this function is called first time after - * boot (last section not fully populated at boot time contains unused memory - * pages with PG_reserved bit not set; online_pages_range() does not allow page - * onlining in whole range if first onlined page does not have PG_reserved - * bit set). Real size of added memory is established at page onlining stage. - */ + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return NULL; + + res->name = "System RAM"; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + ret = allocate_resource(&iomem_resource, res, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new System RAM resource\n"); + kfree(res); + return NULL; + } -static enum bp_state reserve_additional_memory(long credit) + return res; +} + +static enum bp_state reserve_additional_memory(void) { + long credit; + struct resource *resource; int nid, rc; - u64 hotplug_start_paddr; - unsigned long balloon_hotplug = credit; + unsigned long balloon_hotplug; - hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); - balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); - nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + credit = balloon_stats.target_pages + balloon_stats.target_unpopulated + - balloon_stats.total_pages; - rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); + /* + * Already hotplugged enough pages? Wait for them to be + * onlined. + */ + if (credit <= 0) + return BP_WAIT; - if (rc) { - pr_info("%s: add_memory() failed: %i\n", __func__, rc); - return BP_EAGAIN; + balloon_hotplug = round_up(credit, PAGES_PER_SECTION); + + resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE); + if (!resource) + goto err; + + nid = memory_add_physaddr_to_nid(resource->start); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + + /* + * add_memory() will build page tables for the new memory so + * the p2m must contain invalid entries so the correct + * non-present PTEs will be written. + * + * If a failure occurs, the original (identity) p2m entries + * are not restored since this region is now known not to + * conflict with any devices. + */ + if (xen_pv_domain()) { + unsigned long pfn, i; + + pfn = PFN_DOWN(resource->start); + for (i = 0; i < balloon_hotplug; i++) { + if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { + pr_warn("set_phys_to_machine() failed, no memory added\n"); + goto err; + } + } } +#endif - balloon_hotplug -= credit; + /* + * add_memory_resource() will call online_pages() which in its turn + * will call xen_online_page() callback causing deadlock if we don't + * release balloon_mutex here. Unlocking here is safe because the + * callers drop the mutex before trying again. 
+ */ + mutex_unlock(&balloon_mutex); + /* add_memory_resource() requires the device_hotplug lock */ + lock_device_hotplug(); + rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE); + unlock_device_hotplug(); + mutex_lock(&balloon_mutex); - balloon_stats.hotplug_pages += credit; - balloon_stats.balloon_hotplug = balloon_hotplug; + if (rc) { + pr_warn("Cannot add additional memory (%i)\n", rc); + goto err; + } - return BP_DONE; + balloon_stats.total_pages += balloon_hotplug; + + return BP_WAIT; + err: + release_memory_resource(resource); + return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static void xen_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(page); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - - if (balloon_stats.hotplug_pages) - --balloon_stats.hotplug_pages; - else - --balloon_stats.balloon_hotplug; - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + balloon_append(p); + } mutex_unlock(&balloon_mutex); } static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { if (val == MEM_ONLINE) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); return NOTIFY_OK; } @@ -274,69 +369,45 @@ static struct notifier_block xen_memory_nb = { .priority = 0 }; #else -static long current_credit(void) +static enum bp_state reserve_additional_memory(void) { - unsigned long target = balloon_stats.target_pages; - - target = min(target, - balloon_stats.current_pages + - balloon_stats.balloon_low + - balloon_stats.balloon_high); - - return target - balloon_stats.current_pages; + balloon_stats.target_pages = balloon_stats.current_pages + + balloon_stats.target_unpopulated; + return BP_ECANCELED; } +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ -static bool balloon_is_inflated(void) +static long current_credit(void) { - if (balloon_stats.balloon_low || balloon_stats.balloon_high) - return true; - else - return false; + return balloon_stats.target_pages - balloon_stats.current_pages; } -static enum bp_state reserve_additional_memory(long credit) +static bool balloon_is_inflated(void) { - balloon_stats.target_pages = balloon_stats.current_pages; - return BP_DONE; + return balloon_stats.balloon_low || balloon_stats.balloon_high; } -#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ static enum bp_state increase_reservation(unsigned long nr_pages) { int rc; - unsigned long pfn, i; + unsigned long i; struct page *page; - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { - nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); - balloon_stats.hotplug_pages += nr_pages; - balloon_stats.balloon_hotplug -= nr_pages; - return BP_DONE; - } -#endif if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; break; } - frame_list[i] = page_to_pfn(page); + + frame_list[i] = page_to_xen_pfn(page); page = balloon_next_page(page); } - set_xen_guest_handle(reservation.extent_start, frame_list); - reservation.nr_extents = nr_pages; - rc = 
HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + rc = xenmem_reservation_increase(nr_pages, frame_list); if (rc <= 0) return BP_EAGAIN; @@ -344,26 +415,14 @@ static enum bp_state increase_reservation(unsigned long nr_pages) page = balloon_retrieve(false); BUG_ON(page == NULL); - pfn = page_to_pfn(page); - BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && - phys_to_machine_mapping_valid(pfn)); - - set_phys_to_machine(pfn, frame_list[i]); + xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); -#ifdef CONFIG_XEN_HAVE_PVMMU - /* Link back into the page tables if not highmem. */ - if (xen_pv_domain() && !PageHighMem(page)) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); - } -#endif - - /* Relinquish the page back to the allocator. */ - __free_reserved_page(page); + /* + * Relinquish the page back to the allocator. Note that + * some pages, including ones added via xen_online_page(), might + * not be marked reserved; free_reserved_page() will handle that. + */ + free_reserved_page(page); } balloon_stats.current_pages += rc; @@ -374,23 +433,10 @@ static enum bp_state increase_reservation(unsigned long nr_pages) static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { enum bp_state state = BP_DONE; - unsigned long pfn, i; - struct page *page; + unsigned long i; + struct page *page, *tmp; int ret; - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (balloon_stats.hotplug_pages) { - nr_pages = min(nr_pages, balloon_stats.hotplug_pages); - balloon_stats.hotplug_pages -= nr_pages; - balloon_stats.balloon_hotplug += nr_pages; - return BP_DONE; - } -#endif + LIST_HEAD(pages); if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -402,36 +448,38 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) state = BP_EAGAIN; break; } + adjust_managed_page_count(page, -1); + xenmem_reservation_scrub_page(page); + list_add(&page->lru, &pages); + } - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ + kmap_flush_unused(); - scrub_page(page); + /* + * Setup the frame, update direct mapping, invalidate P2M, + * and add to balloon. + */ + i = 0; + list_for_each_entry_safe(page, tmp, &pages, lru) { + frame_list[i++] = xen_page_to_gfn(page); -#ifdef CONFIG_XEN_HAVE_PVMMU - if (xen_pv_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } -#endif - } + xenmem_reservation_va_mapping_reset(1, &page); - /* Ensure that ballooned highmem pages don't have kmaps. */ - kmap_flush_unused(); - flush_tlb_all(); + list_del(&page->lru); - /* No more mappings: invalidate P2M and add to balloon. 
*/ - for (i = 0; i < nr_pages; i++) { - pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + balloon_append(page); } - set_xen_guest_handle(reservation.extent_start, frame_list); - reservation.nr_extents = nr_pages; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + flush_tlb_all(); + + ret = xenmem_reservation_decrease(nr_pages, frame_list); BUG_ON(ret != nr_pages); balloon_stats.current_pages -= nr_pages; @@ -440,44 +488,79 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * We avoid multiple worker processes conflicting via the balloon mutex. + * Stop waiting if either state is BP_DONE and ballooning action is + * needed, or if the credit has changed while state is not BP_DONE. + */ +static bool balloon_thread_cond(long credit) +{ + if (balloon_state == BP_DONE) + credit = 0; + + return current_credit() != credit || kthread_should_stop(); +} + +/* + * As this is a kthread it is guaranteed to run as a single instance only. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ -static void balloon_process(struct work_struct *work) +static int balloon_thread(void *unused) { - enum bp_state state = BP_DONE; long credit; + unsigned long timeout; + + set_freezable(); + for (;;) { + switch (balloon_state) { + case BP_DONE: + case BP_ECANCELED: + timeout = 3600 * HZ; + break; + case BP_EAGAIN: + timeout = balloon_stats.schedule_delay * HZ; + break; + case BP_WAIT: + timeout = HZ; + break; + } - mutex_lock(&balloon_mutex); + credit = current_credit(); + + wait_event_freezable_timeout(balloon_thread_wq, + balloon_thread_cond(credit), timeout); + + if (kthread_should_stop()) + return 0; + + mutex_lock(&balloon_mutex); - do { credit = current_credit(); if (credit > 0) { if (balloon_is_inflated()) - state = increase_reservation(credit); + balloon_state = increase_reservation(credit); else - state = reserve_additional_memory(credit); + balloon_state = reserve_additional_memory(); } - if (credit < 0) - state = decrease_reservation(-credit, GFP_BALLOON); + if (credit < 0) { + long n_pages; - state = update_schedule(state); + n_pages = min(-credit, si_mem_available()); + balloon_state = decrease_reservation(n_pages, + GFP_BALLOON); + if (balloon_state == BP_DONE && n_pages != -credit && + n_pages < totalreserve_pages) + balloon_state = BP_EAGAIN; + } -#ifndef CONFIG_PREEMPT - if (need_resched()) - schedule(); -#endif - } while (credit && state == BP_DONE); + update_schedule(); - /* Schedule more work if there is some still to be done. */ - if (state == BP_EAGAIN) - schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); + mutex_unlock(&balloon_mutex); - mutex_unlock(&balloon_mutex); + cond_resched(); + } } /* Resets the Xen limit, sets new target, and kicks off processing. */ @@ -485,56 +568,99 @@ void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. 
*/ balloon_stats.target_pages = target; - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); } EXPORT_SYMBOL_GPL(balloon_set_new_target); +static int add_ballooned_pages(unsigned int nr_pages) +{ + enum bp_state st; + + if (xen_hotplug_unpopulated) { + st = reserve_additional_memory(); + if (st != BP_ECANCELED) { + int rc; + + mutex_unlock(&balloon_mutex); + rc = wait_event_interruptible(balloon_wq, + !list_empty(&ballooned_pages)); + mutex_lock(&balloon_mutex); + return rc ? -ENOMEM : 0; + } + } + + if (si_mem_available() < nr_pages) + return -ENOMEM; + + st = decrease_reservation(nr_pages, GFP_USER); + if (st != BP_DONE) + return -ENOMEM; + + return 0; +} + /** - * alloc_xenballooned_pages - get pages that have been ballooned out + * xen_alloc_ballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned - * @highmem: allow highmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages) { - int pgno = 0; + unsigned int pgno = 0; struct page *page; + int ret; + mutex_lock(&balloon_mutex); + + balloon_stats.target_unpopulated += nr_pages; + while (pgno < nr_pages) { - page = balloon_retrieve(highmem); - if (page && (highmem || !PageHighMem(page))) { + page = balloon_retrieve(true); + if (page) { pages[pgno++] = page; +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + + if (xen_pv_domain()) { + ret = xen_alloc_p2m_entry(page_to_pfn(page)); + if (ret < 0) + goto out_undo; + } +#endif } else { - enum bp_state st; - if (page) - balloon_append(page); - st = decrease_reservation(nr_pages - pgno, - highmem ? GFP_HIGHUSER : GFP_USER); - if (st != BP_DONE) + ret = add_ballooned_pages(nr_pages - pgno); + if (ret < 0) goto out_undo; } } mutex_unlock(&balloon_mutex); return 0; out_undo: - while (pgno) - balloon_append(pages[--pgno]); - /* Free the memory back to the kernel soon */ - schedule_delayed_work(&balloon_worker, 0); mutex_unlock(&balloon_mutex); - return -ENOMEM; + xen_free_ballooned_pages(pgno, pages); + /* + * NB: xen_free_ballooned_pages will only subtract pgno pages, but since + * target_unpopulated is incremented with nr_pages at the start we need + * to remove the remaining ones also, or accounting will be screwed. + */ + balloon_stats.target_unpopulated -= nr_pages - pgno; + return ret; } -EXPORT_SYMBOL(alloc_xenballooned_pages); +EXPORT_SYMBOL(xen_alloc_ballooned_pages); /** - * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * xen_free_ballooned_pages - return pages retrieved with get_ballooned_pages * @nr_pages: Number of pages * @pages: pages to return */ -void free_xenballooned_pages(int nr_pages, struct page **pages) +void xen_free_ballooned_pages(unsigned int nr_pages, struct page **pages) { - int i; + unsigned int i; mutex_lock(&balloon_mutex); @@ -543,77 +669,138 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) balloon_append(pages[i]); } + balloon_stats.target_unpopulated -= nr_pages; + /* The balloon may be too large now. Shrink it if needed. 
*/ if (current_credit()) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); mutex_unlock(&balloon_mutex); } -EXPORT_SYMBOL(free_xenballooned_pages); +EXPORT_SYMBOL(xen_free_ballooned_pages); -static void __init balloon_add_region(unsigned long start_pfn, - unsigned long pages) +static int __init balloon_add_regions(void) { + unsigned long start_pfn, pages; unsigned long pfn, extra_pfn_end; - struct page *page; - - /* - * If the amount of usable memory has been limited (e.g., with - * the 'mem' command line parameter), don't add pages beyond - * this limit. - */ - extra_pfn_end = min(max_pfn, start_pfn + pages); - - for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { - page = pfn_to_page(pfn); - /* totalram_pages and totalhigh_pages do not - include the boot-time balloon extension, so - don't subtract from it. */ - __balloon_append(page); + unsigned int i; + + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + pages = xen_extra_mem[i].n_pfns; + if (!pages) + continue; + + start_pfn = xen_extra_mem[i].start_pfn; + + /* + * If the amount of usable memory has been limited (e.g., with + * the 'mem' command line parameter), don't add pages beyond + * this limit. + */ + extra_pfn_end = min(max_pfn, start_pfn + pages); + + for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) + balloon_append(pfn_to_page(pfn)); + + /* + * Extra regions are accounted for in the physmap, but need + * decreasing from current_pages and target_pages to balloon + * down the initial allocation, because they are already + * accounted for in total_pages. + */ + pages = extra_pfn_end - start_pfn; + if (pages >= balloon_stats.current_pages || + pages >= balloon_stats.target_pages) { + WARN(1, "Extra pages underflow current target"); + return -ERANGE; + } + balloon_stats.current_pages -= pages; + balloon_stats.target_pages -= pages; } + + return 0; } static int __init balloon_init(void) { - int i; + struct task_struct *task; + int rc; if (!xen_domain()) return -ENODEV; pr_info("Initialising balloon driver\n"); - balloon_stats.current_pages = xen_pv_domain() - ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + if (xen_released_pages >= get_num_physpages()) { + WARN(1, "Released pages underflow current target"); + return -ERANGE; + } + + balloon_stats.current_pages = get_num_physpages() - xen_released_pages; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; + balloon_stats.total_pages = balloon_stats.current_pages; balloon_stats.schedule_delay = 1; balloon_stats.max_schedule_delay = 32; balloon_stats.retry_count = 1; - balloon_stats.max_retry_count = RETRY_UNLIMITED; + balloon_stats.max_retry_count = 4; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - balloon_stats.hotplug_pages = 0; - balloon_stats.balloon_hotplug = 0; - set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); + register_sysctl_init("xen/balloon", balloon_table); #endif - /* - * Initialize the balloon with pages from the extra memory - * regions (see arch/x86/xen/setup.c). - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].size) - balloon_add_region(PFN_UP(xen_extra_mem[i].start), - PFN_DOWN(xen_extra_mem[i].size)); + rc = balloon_add_regions(); + if (rc) + return rc; + + task = kthread_run(balloon_thread, NULL, "xen-balloon"); + if (IS_ERR(task)) { + pr_err("xen-balloon thread could not be started, ballooning will not work!\n"); + return PTR_ERR(task); + } + + /* Init the xen-balloon driver. 
*/ + xen_balloon_init(); return 0; } - subsys_initcall(balloon_init); -MODULE_LICENSE("GPL"); +static int __init balloon_wait_finish(void) +{ + long credit, last_credit = 0; + unsigned long last_changed = 0; + + if (!xen_domain()) + return -ENODEV; + + /* PV guests don't need to wait. */ + if (xen_pv_domain() || !current_credit()) + return 0; + + pr_notice("Waiting for initial ballooning down having finished.\n"); + + while ((credit = current_credit()) < 0) { + if (credit != last_credit) { + last_changed = jiffies; + last_credit = credit; + } + if (balloon_state == BP_ECANCELED) { + pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n", + -credit); + if (time_is_before_eq_jiffies(last_changed + HZ * balloon_boot_timeout)) + panic("Initial ballooning failed!\n"); + } + + schedule_timeout_interruptible(HZ / 10); + } + + pr_notice("Initial ballooning down finished.\n"); + + return 0; +} +late_initcall_sync(balloon_wait_finish); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 0edb91c0de6b..05a286d24f14 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -1,15 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bio.h> -#include <linux/io.h> #include <linux/export.h> +#include <xen/xen.h> #include <xen/page.h> +/* check if @page can be merged with 'vec1' */ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2) + const struct page *page) { - unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); - unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); +#if XEN_PAGE_SIZE == PAGE_SIZE + unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); + unsigned long bfn2 = pfn_to_bfn(page_to_pfn(page)); - return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && - ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); + return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; +#else + /* + * XXX: Add support for merging bio_vec when using different page + * size in Xen and Linux. 
+ */ + return false; +#endif } -EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index cc6513a176b0..b96b11e2b571 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt #include <linux/notifier.h> @@ -11,17 +12,23 @@ static void enable_hotplug_cpu(int cpu) { if (!cpu_present(cpu)) - arch_register_cpu(cpu); + xen_arch_register_cpu(cpu); set_cpu_present(cpu, true); } static void disable_hotplug_cpu(int cpu) { - if (cpu_present(cpu)) - arch_unregister_cpu(cpu); - - set_cpu_present(cpu, false); + if (!cpu_is_hotpluggable(cpu)) + return; + lock_device_hotplug(); + if (cpu_online(cpu)) + device_offline(get_cpu_device(cpu)); + if (!cpu_online(cpu) && cpu_present(cpu)) { + xen_arch_unregister_cpu(cpu); + set_cpu_present(cpu, false); + } + unlock_device_hotplug(); } static int vcpu_online(unsigned int cpu) @@ -47,7 +54,7 @@ static int vcpu_online(unsigned int cpu) } static void vcpu_hotplug(unsigned int cpu) { - if (!cpu_possible(cpu)) + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) return; switch (vcpu_online(cpu)) { @@ -55,7 +62,6 @@ static void vcpu_hotplug(unsigned int cpu) enable_hotplug_cpu(cpu); break; case 0: - (void)cpu_down(cpu); disable_hotplug_cpu(cpu); break; default: @@ -64,13 +70,12 @@ static void vcpu_hotplug(unsigned int cpu) } static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { unsigned int cpu; char *cpustr; - const char *node = vec[XS_WATCH_PATH]; - cpustr = strstr(node, "cpu/"); + cpustr = strstr(path, "cpu/"); if (cpustr != NULL) { sscanf(cpustr, "cpu/%u", &cpu); vcpu_hotplug(cpu); @@ -88,10 +93,8 @@ static int setup_cpu_watcher(struct notifier_block *notifier, (void)register_xenbus_watch(&cpu_watch); for_each_possible_cpu(cpu) { - if (vcpu_online(cpu) == 0) { - (void)cpu_down(cpu); - set_cpu_present(cpu, false); - } + if (vcpu_online(cpu) == 0) + disable_hotplug_cpu(cpu); } return NOTIFY_DONE; @@ -102,7 +105,11 @@ static int __init setup_vcpu_hotplug_event(void) static struct notifier_block xsn_cpu = { .notifier_call = setup_cpu_watcher }; - if (!xen_pv_domain()) +#ifdef CONFIG_X86 + if (!xen_pv_domain() && !xen_pvh_domain()) +#else + if (!xen_domain()) +#endif return -ENODEV; register_xenstore_notifier(&xsn_cpu); @@ -110,5 +117,5 @@ static int __init setup_vcpu_hotplug_event(void) return 0; } -arch_initcall(setup_vcpu_hotplug_event); +late_initcall(setup_vcpu_hotplug_event); diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index f3ccc80a455f..cfb5de31d860 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/pci.h> #include <linux/usb.h> #include <linux/usb/ehci_def.h> @@ -19,7 +20,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct pci_dev *pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c new file mode 100644 index 000000000000..fb321cd6415a --- /dev/null +++ b/drivers/xen/efi.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * EFI support for Xen. + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. 
+ * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * Copyright (C) 2011 Novell Co. + * Jan Beulich <JBeulich@suse.com> + * Copyright (C) 2011-2012 Oracle Co. + * Liang Tang <liang.tang@oracle.com> + * Copyright (c) 2014 Oracle Co., Daniel Kiper + */ + +#include <linux/bug.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/string.h> + +#include <xen/interface/xen.h> +#include <xen/interface/platform.h> +#include <xen/page.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> + +#include <asm/page.h> + +#include <asm/xen/hypercall.h> + +#define INIT_EFI_OP(name) \ + {.cmd = XENPF_efi_runtime_call, \ + .u.efi_runtime_call.function = XEN_EFI_##name, \ + .u.efi_runtime_call.misc = 0} + +#define efi_data(op) (op.u.efi_runtime_call) + +static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + struct xen_platform_op op = INIT_EFI_OP(get_time); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + if (tm) { + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_time.time)); + memcpy(tm, &efi_data(op).u.get_time.time, sizeof(*tm)); + } + + if (tc) { + tc->resolution = efi_data(op).u.get_time.resolution; + tc->accuracy = efi_data(op).u.get_time.accuracy; + tc->sets_to_zero = !!(efi_data(op).misc & + XEN_EFI_GET_TIME_SET_CLEARS_NS); + } + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_time(efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(set_time); + + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_time)); + memcpy(&efi_data(op).u.set_time, tm, sizeof(*tm)); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + if (tm) { + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_wakeup_time)); + memcpy(tm, &efi_data(op).u.get_wakeup_time, sizeof(*tm)); + } + + if (enabled) + *enabled = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_ENABLED); + + if (pending) + *pending = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_PENDING); + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time); + + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_wakeup_time)); + if (enabled) + efi_data(op).misc = XEN_EFI_SET_WAKEUP_TIME_ENABLE; + if (tm) + memcpy(&efi_data(op).u.set_wakeup_time, tm, sizeof(*tm)); + else + efi_data(op).misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY; + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 *attr, unsigned long *data_size, + void *data) +{ + struct xen_platform_op op = INIT_EFI_OP(get_variable); + + set_xen_guest_handle(efi_data(op).u.get_variable.name, name); + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.get_variable.vendor_guid)); + memcpy(&efi_data(op).u.get_variable.vendor_guid, vendor, sizeof(*vendor)); + efi_data(op).u.get_variable.size = *data_size; + 
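	/*
	 * Pass the caller's buffer by guest handle; after the hypercall the
	 * size field holds the value reported by Xen and is copied back to
	 * *data_size below.
	 */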
set_xen_guest_handle(efi_data(op).u.get_variable.data, data); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + *data_size = efi_data(op).u.get_variable.size; + if (attr) + *attr = efi_data(op).misc; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name); + + efi_data(op).u.get_next_variable_name.size = *name_size; + set_xen_guest_handle(efi_data(op).u.get_next_variable_name.name, name); + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.get_next_variable_name.vendor_guid)); + memcpy(&efi_data(op).u.get_next_variable_name.vendor_guid, vendor, + sizeof(*vendor)); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + *name_size = efi_data(op).u.get_next_variable_name.size; + memcpy(vendor, &efi_data(op).u.get_next_variable_name.vendor_guid, + sizeof(*vendor)); + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + struct xen_platform_op op = INIT_EFI_OP(set_variable); + + set_xen_guest_handle(efi_data(op).u.set_variable.name, name); + efi_data(op).misc = attr; + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.set_variable.vendor_guid)); + memcpy(&efi_data(op).u.set_variable.vendor_guid, vendor, sizeof(*vendor)); + efi_data(op).u.set_variable.size = data_size; + set_xen_guest_handle(efi_data(op).u.set_variable.data, data); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + struct xen_platform_op op = INIT_EFI_OP(query_variable_info); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + efi_data(op).u.query_variable_info.attr = attr; + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + *storage_space = efi_data(op).u.query_variable_info.max_store_size; + *remaining_space = efi_data(op).u.query_variable_info.remain_store_size; + *max_variable_size = efi_data(op).u.query_variable_info.max_size; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) +{ + struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + *count = efi_data(op).misc; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, unsigned long sg_list) +{ + struct xen_platform_op op = INIT_EFI_OP(update_capsule); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + set_xen_guest_handle(efi_data(op).u.update_capsule.capsule_header_array, + capsules); + efi_data(op).u.update_capsule.capsule_count = count; + efi_data(op).u.update_capsule.sg_list = sg_list; + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, u64 *max_size, int *reset_type) +{ + struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + 
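	/* Pass the caller's capsule header array straight through to Xen. */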
set_xen_guest_handle(efi_data(op).u.query_capsule_capabilities.capsule_header_array, + capsules); + efi_data(op).u.query_capsule_capabilities.capsule_count = count; + + if (HYPERVISOR_platform_op(&op) < 0) + return EFI_UNSUPPORTED; + + *max_size = efi_data(op).u.query_capsule_capabilities.max_capsule_size; + *reset_type = efi_data(op).u.query_capsule_capabilities.reset_type; + + return efi_data(op).status; +} + +static void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) +{ + switch (reset_type) { + case EFI_RESET_COLD: + case EFI_RESET_WARM: + xen_reboot(SHUTDOWN_reboot); + break; + case EFI_RESET_SHUTDOWN: + xen_reboot(SHUTDOWN_poweroff); + break; + default: + BUG(); + } +} + +/* + * Set XEN EFI runtime services function pointers. Other fields of struct efi, + * e.g. efi.systab, will be set like normal EFI. + */ +void __init xen_efi_runtime_setup(void) +{ + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; +} + +int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) +{ + static_assert(XEN_PAGE_SHIFT == EFI_PAGE_SHIFT, + "Mismatch between EFI_PAGE_SHIFT and XEN_PAGE_SHIFT"); + struct xen_platform_op op; + union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info; + int rc; + + if (!efi_enabled(EFI_PARAVIRT) || efi_enabled(EFI_MEMMAP)) + return __efi_mem_desc_lookup(phys_addr, out_md); + phys_addr &= ~(u64)(EFI_PAGE_SIZE - 1); + op = (struct xen_platform_op) { + .cmd = XENPF_firmware_info, + .u.firmware_info = { + .type = XEN_FW_EFI_INFO, + .index = XEN_FW_EFI_MEM_INFO, + .u.efi_info.mem.addr = phys_addr, + .u.efi_info.mem.size = U64_MAX - phys_addr, + }, + }; + + rc = HYPERVISOR_platform_op(&op); + if (rc) { + pr_warn("Failed to lookup header 0x%llx in Xen memory map: error %d\n", + phys_addr, rc); + } + + out_md->phys_addr = info->mem.addr; + out_md->num_pages = info->mem.size >> EFI_PAGE_SHIFT; + out_md->type = info->mem.type; + out_md->attribute = info->mem.attr; + + return 0; +} + +bool __init xen_efi_config_table_is_usable(const efi_guid_t *guid, + unsigned long table) +{ + efi_memory_desc_t md; + int rc; + + if (!efi_enabled(EFI_PARAVIRT)) + return true; + + rc = efi_mem_desc_lookup(table, &md); + if (rc) + return false; + + switch (md.type) { + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_ACPI_RECLAIM_MEMORY: + case EFI_ACPI_MEMORY_NVS: + case EFI_RESERVED_TYPE: + return true; + default: + return false; + } +} diff --git a/drivers/xen/events.c b/drivers/xen/events.c deleted file mode 100644 index a58ac435a9a4..000000000000 --- a/drivers/xen/events.c +++ /dev/null @@ -1,1906 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. 
The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is received, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. PIRQs - Hardware interrupts. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/slab.h> -#include <linux/irqnr.h> -#include <linux/pci.h> - -#ifdef CONFIG_X86 -#include <asm/desc.h> -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/idle.h> -#include <asm/io_apic.h> -#include <asm/xen/page.h> -#include <asm/xen/pci.h> -#endif -#include <asm/sync_bitops.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/xen.h> -#include <xen/hvm.h> -#include <xen/xen-ops.h> -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> -#include <xen/interface/hvm/hvm_op.h> -#include <xen/interface/hvm/params.h> -#include <xen/interface/physdev.h> -#include <xen/interface/sched.h> -#include <asm/hw_irq.h> - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_MUTEX(irq_mapping_update_lock); - -static LIST_HEAD(xen_irq_list_head); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - physical IRQ, GSI, flags, and owner domain - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; -#ifdef CONFIG_X86 -static unsigned long *pirq_eoi_map; -#endif -static bool (*pirq_needs_eoi)(unsigned irq); - -/* - * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be - * careful to only use bitops which allow for this (e.g - * test_bit/find_first_bit and friends but not __ffs) and to pass - * BITS_PER_EVTCHN_WORD as the bitmask length. - */ -#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) -/* - * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t - * array. Primarily to avoid long lines (hence the terse name). 
- */ -#define BM(x) (unsigned long *)(x) -/* Find the first set bit in a evtchn mask */ -#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) - -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -static struct irq_chip xen_dynamic_chip; -static struct irq_chip xen_percpu_chip; -static struct irq_chip xen_pirq_chip; -static void enable_dynirq(struct irq_data *data); -static void disable_dynirq(struct irq_data *data); - -/* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) -{ - return irq_get_handler_data(irq); -} - -/* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, - unsigned irq, - enum xen_irq_type type, - unsigned short evtchn, - unsigned short cpu) -{ - - BUG_ON(info->type != IRQT_UNBOUND && info->type != type); - - info->type = type; - info->irq = irq; - info->evtchn = evtchn; - info->cpu = cpu; - - evtchn_to_irq[evtchn] = irq; - - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); -} - -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) -{ - struct irq_info *info = info_for_irq(irq); - - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); -} - -static void xen_irq_info_ipi_init(unsigned cpu, - unsigned irq, - unsigned short evtchn, - enum ipi_vector ipi) -{ - struct irq_info *info = info_for_irq(irq); - - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - - info->u.ipi = ipi; - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; -} - -static void xen_irq_info_virq_init(unsigned cpu, - unsigned irq, - unsigned short evtchn, - unsigned short virq) -{ - struct irq_info *info = info_for_irq(irq); - - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - - info->u.virq = virq; - - per_cpu(virq_to_irq, cpu)[virq] = irq; -} - -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, - uint16_t domid, - unsigned char flags) -{ - struct irq_info *info = info_for_irq(irq); - - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - - info->u.pirq.pirq = pirq; - info->u.pirq.gsi = gsi; - info->u.pirq.domid = domid; - info->u.pirq.flags = flags; -} - -/* - * Accessors for packed IRQ information. 
- */ -static unsigned int evtchn_from_irq(unsigned irq) -{ - if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) - return 0; - - return info_for_irq(irq)->evtchn; -} - -unsigned irq_from_evtchn(unsigned int evtchn) -{ - return evtchn_to_irq[evtchn]; -} -EXPORT_SYMBOL_GPL(irq_from_evtchn); - -static enum ipi_vector ipi_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_IPI); - - return info->u.ipi; -} - -static unsigned virq_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_VIRQ); - - return info->u.virq; -} - -static unsigned pirq_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.pirq; -} - -static enum xen_irq_type type_from_irq(unsigned irq) -{ - return info_for_irq(irq)->type; -} - -static unsigned cpu_from_irq(unsigned irq) -{ - return info_for_irq(irq)->cpu; -} - -static unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - int irq = evtchn_to_irq[evtchn]; - unsigned ret = 0; - - if (irq != -1) - ret = cpu_from_irq(irq); - - return ret; -} - -#ifdef CONFIG_X86 -static bool pirq_check_eoi_map(unsigned irq) -{ - return test_bit(pirq_from_irq(irq), pirq_eoi_map); -} -#endif - -static bool pirq_needs_eoi_flag(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.flags & PIRQ_NEEDS_EOI; -} - -static inline xen_ulong_t active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); -#endif - - clear_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)))); - set_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu))); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); -} - -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. 
- */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_mask[0])); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else { - /* - * Need to clear the mask before checking pending to - * avoid a race with an event becoming pending. - * - * EVTCHNOP_unmask will only trigger an upcall if the - * mask bit was set, so if a hypercall is needed - * remask the event. - */ - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - - if (unlikely(evtchn_pending && xen_hvm_domain())) { - sync_set_bit(port, BM(&s->evtchn_mask[0])); - do_hypercall = 1; - } - } - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, - BM(&vcpu_info->evtchn_pending_sel))) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static void xen_irq_init(unsigned irq) -{ - struct irq_info *info; -#ifdef CONFIG_SMP - struct irq_desc *desc = irq_to_desc(irq); - - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); -#endif - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info == NULL) - panic("Unable to allocate metadata for IRQ%d\n", irq); - - info->type = IRQT_UNBOUND; - info->refcnt = -1; - - irq_set_handler_data(irq, info); - - list_add_tail(&info->list, &xen_irq_list_head); -} - -static int __must_check xen_allocate_irq_dynamic(void) -{ - int first = 0; - int irq; - -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. - */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif - - irq = irq_alloc_desc_from(first, -1); - - if (irq >= 0) - xen_irq_init(irq); - - return irq; -} - -static int __must_check xen_allocate_irq_gsi(unsigned gsi) -{ - int irq; - - /* - * A PV guest has no concept of a GSI (since it has no ACPI - * nor access to/knowledge of the physical APICs). Therefore - * all IRQs are dynamically allocated from the entire IRQ - * space. - */ - if (xen_pv_domain() && !xen_initial_domain()) - return xen_allocate_irq_dynamic(); - - /* Legacy IRQ descriptors are already allocated by the arch. 
*/ - if (gsi < NR_IRQS_LEGACY) - irq = gsi; - else - irq = irq_alloc_desc_at(gsi, -1); - - xen_irq_init(irq); - - return irq; -} - -static void xen_free_irq(unsigned irq) -{ - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - - list_del(&info->list); - - irq_set_handler_data(irq, NULL); - - WARN_ON(info->refcnt > 0); - - kfree(info); - - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq < NR_IRQS_LEGACY) - return; - - irq_free_desc(irq); -} - -static void pirq_query_unmask(int irq) -{ - struct physdev_irq_status_query irq_status; - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info->type != IRQT_PIRQ); - - irq_status.irq = pirq_from_irq(irq); - if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) - irq_status.flags = 0; - - info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; - if (irq_status.flags & XENIRQSTAT_needs_eoi) - info->u.pirq.flags |= PIRQ_NEEDS_EOI; -} - -static bool probing_irq(int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->action == NULL; -} - -static void eoi_pirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; - int rc = 0; - - irq_move_irq(data); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); - - if (pirq_needs_eoi(data->irq)) { - rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); - WARN_ON(rc); - } -} - -static void mask_ack_pirq(struct irq_data *data) -{ - disable_dynirq(data); - eoi_pirq(data); -} - -static unsigned int __startup_pirq(unsigned int irq) -{ - struct evtchn_bind_pirq bind_pirq; - struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); - int rc; - - BUG_ON(info->type != IRQT_PIRQ); - - if (VALID_EVTCHN(evtchn)) - goto out; - - bind_pirq.pirq = pirq_from_irq(irq); - /* NB. We are happy to share unless we are probing. */ - bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? - BIND_PIRQ__WILL_SHARE : 0; - rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); - if (rc != 0) { - if (!probing_irq(irq)) - pr_info("Failed to obtain physical IRQ %d\n", irq); - return 0; - } - evtchn = bind_pirq.port; - - pirq_query_unmask(irq); - - evtchn_to_irq[evtchn] = irq; - bind_evtchn_to_cpu(evtchn, 0); - info->evtchn = evtchn; - -out: - unmask_evtchn(evtchn); - eoi_pirq(irq_get_irq_data(irq)); - - return 0; -} - -static unsigned int startup_pirq(struct irq_data *data) -{ - return __startup_pirq(data->irq); -} - -static void shutdown_pirq(struct irq_data *data) -{ - struct evtchn_close close; - unsigned int irq = data->irq; - struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); - - BUG_ON(info->type != IRQT_PIRQ); - - if (!VALID_EVTCHN(evtchn)) - return; - - mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; -} - -static void enable_pirq(struct irq_data *data) -{ - startup_pirq(data); -} - -static void disable_pirq(struct irq_data *data) -{ - disable_dynirq(data); -} - -int xen_irq_from_gsi(unsigned gsi) -{ - struct irq_info *info; - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - - if (info->u.pirq.gsi == gsi) - return info->irq; - } - - return -1; -} -EXPORT_SYMBOL_GPL(xen_irq_from_gsi); - -/* - * Do not make any assumptions regarding the relationship between the - * IRQ number returned here and the Xen pirq argument. 
- * - * Note: We don't assign an event channel until the irq actually started - * up. Return an existing irq if we've already got one for the gsi. - * - * Shareable implies level triggered, not shareable implies edge - * triggered here. - */ -int xen_bind_pirq_gsi_to_irq(unsigned gsi, - unsigned pirq, int shareable, char *name) -{ - int irq = -1; - struct physdev_irq irq_op; - - mutex_lock(&irq_mapping_update_lock); - - irq = xen_irq_from_gsi(gsi); - if (irq != -1) { - pr_info("%s: returning irq %d for gsi %u\n", - __func__, irq, gsi); - goto out; - } - - irq = xen_allocate_irq_gsi(gsi); - if (irq < 0) - goto out; - - irq_op.irq = irq; - irq_op.vector = 0; - - /* Only the privileged domain can do this. For non-priv, the pcifront - * driver provides a PCI bus that does the call to do exactly - * this in the priv domain. */ - if (xen_initial_domain() && - HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - xen_free_irq(irq); - irq = -ENOSPC; - goto out; - } - - xen_irq_info_pirq_init(irq, 0, pirq, gsi, DOMID_SELF, - shareable ? PIRQ_SHAREABLE : 0); - - pirq_query_unmask(irq); - /* We try to use the handler with the appropriate semantic for the - * type of interrupt: if the interrupt is an edge triggered - * interrupt we use handle_edge_irq. - * - * On the other hand if the interrupt is level triggered we use - * handle_fasteoi_irq like the native code does for this kind of - * interrupts. - * - * Depending on the Xen version, pirq_needs_eoi might return true - * not only for level triggered interrupts but for edge triggered - * interrupts too. In any case Xen always honors the eoi mechanism, - * not injecting any more pirqs of the same kind if the first one - * hasn't received an eoi yet. Therefore using the fasteoi handler - * is the right choice either way. - */ - if (shareable) - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, - handle_fasteoi_irq, name); - else - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, - handle_edge_irq, name); - -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - -#ifdef CONFIG_PCI_MSI -int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) -{ - int rc; - struct physdev_get_free_pirq op_get_free_pirq; - - op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - - WARN_ONCE(rc == -ENOSYS, - "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); - - return rc ? 
-1 : op_get_free_pirq.pirq; -} - -int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, const char *name, domid_t domid) -{ - int irq, ret; - - mutex_lock(&irq_mapping_update_lock); - - irq = xen_allocate_irq_dynamic(); - if (irq < 0) - goto out; - - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); - - xen_irq_info_pirq_init(irq, 0, pirq, 0, domid, 0); - ret = irq_set_msi_desc(irq, msidesc); - if (ret < 0) - goto error_irq; -out: - mutex_unlock(&irq_mapping_update_lock); - return irq; -error_irq: - mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); - return ret; -} -#endif - -int xen_destroy_irq(int irq) -{ - struct irq_desc *desc; - struct physdev_unmap_pirq unmap_irq; - struct irq_info *info = info_for_irq(irq); - int rc = -ENOENT; - - mutex_lock(&irq_mapping_update_lock); - - desc = irq_to_desc(irq); - if (!desc) - goto out; - - if (xen_initial_domain()) { - unmap_irq.pirq = info->u.pirq.pirq; - unmap_irq.domid = info->u.pirq.domid; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); - /* If another domain quits without making the pci_disable_msix - * call, the Xen hypervisor takes care of freeing the PIRQs - * (free_domain_pirqs). - */ - if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) - pr_info("domain %d does not have %d anymore\n", - info->u.pirq.domid, info->u.pirq.pirq); - else if (rc) { - pr_warn("unmap irq failed %d\n", rc); - goto out; - } - } - - xen_free_irq(irq); - -out: - mutex_unlock(&irq_mapping_update_lock); - return rc; -} - -int xen_irq_from_pirq(unsigned pirq) -{ - int irq; - - struct irq_info *info; - - mutex_lock(&irq_mapping_update_lock); - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - irq = info->irq; - if (info->u.pirq.pirq == pirq) - goto out; - } - irq = -1; -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - - -int xen_pirq_from_irq(unsigned irq) -{ - return pirq_from_irq(irq); -} -EXPORT_SYMBOL_GPL(xen_pirq_from_irq); -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - mutex_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) - goto out; - - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_edge_irq, "event"); - - xen_irq_info_evtchn_init(irq, evtchn); - } else { - struct irq_info *info = info_for_irq(irq); - WARN_ON(info == NULL || info->type != IRQT_EVTCHN); - } - -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - mutex_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) - goto out; - - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, - handle_percpu_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - - bind_evtchn_to_cpu(evtchn, cpu); - } else { - struct irq_info *info = info_for_irq(irq); - WARN_ON(info == NULL || info->type != IRQT_IPI); - } - - out: - mutex_unlock(&irq_mapping_update_lock); - return irq; -} - -static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; 
- - bind_interdomain.remote_dom = remote_domain; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - - return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); -} - -static int find_virq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_status status; - int port, rc = -ENOENT; - - memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { - status.dom = DOMID_SELF; - status.port = port; - rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); - if (rc < 0) - continue; - if (status.status != EVTCHNSTAT_virq) - continue; - if (status.u.virq == virq && status.vcpu == cpu) { - rc = port; - break; - } - } - return rc; -} - -int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq, ret; - - mutex_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) - goto out; - - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, - handle_percpu_irq, "virq"); - - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq); - if (ret == 0) - evtchn = bind_virq.port; - else { - if (ret == -EEXIST) - ret = find_virq(virq, cpu); - BUG_ON(ret < 0); - evtchn = ret; - } - - xen_irq_info_virq_init(cpu, irq, evtchn, virq); - - bind_evtchn_to_cpu(evtchn, cpu); - } else { - struct irq_info *info = info_for_irq(irq); - WARN_ON(info == NULL || info->type != IRQT_VIRQ); - } - -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. 
*/ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: - mutex_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) -{ - int irq, retval; - - irq = bind_evtchn_to_irq(evtchn); - if (irq < 0) - return irq; - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - int irq, retval; - - irq = bind_virq_to_irq(virq, cpu); - if (irq < 0) - return irq; - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -int evtchn_make_refcounted(unsigned int evtchn) -{ - int irq = evtchn_to_irq[evtchn]; - struct irq_info *info; - - if (irq == -1) - return -ENOENT; - - info = irq_get_handler_data(irq); - - if (!info) - return -ENOENT; - - WARN_ON(info->refcnt != -1); - - info->refcnt = 1; - - return 0; -} -EXPORT_SYMBOL_GPL(evtchn_make_refcounted); - -int evtchn_get(unsigned int evtchn) -{ - int irq; - struct irq_info *info; - int err = -ENOENT; - - if (evtchn >= NR_EVENT_CHANNELS) - return -EINVAL; - - mutex_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - if (irq == -1) - goto done; - - info = irq_get_handler_data(irq); - - if (!info) - goto done; - - err = -EINVAL; - if (info->refcnt <= 0) - goto done; - - info->refcnt++; - err = 0; - done: - mutex_unlock(&irq_mapping_update_lock); - - return err; -} -EXPORT_SYMBOL_GPL(evtchn_get); - -void evtchn_put(unsigned int evtchn) -{ - int irq = evtchn_to_irq[evtchn]; - if (WARN_ON(irq == -1)) - return; - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(evtchn_put); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, 
void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - xen_ulong_t pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } - - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, BM(sh->evtchn_pending))) { - int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", - !sync_test_bit(i, BM(sh->evtchn_mask)) - ? "" : " globally-masked", - sync_test_bit(i, BM(cpu_evtchn)) - ? "" : " locally-masked"); - } - } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; -} - -static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); - -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. 
- */ -static void __xen_evtchn_do_upcall(void) -{ - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i, irq; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - unsigned count; - - do { - xen_ulong_t pending_words; - xen_ulong_t pending_bits; - struct irq_desc *desc; - - vcpu_info->evtchn_upcall_pending = 0; - - if (__this_cpu_inc_return(xed_nesting_count) - 1) - goto out; - - /* - * Master flag must be cleared /before/ clearing - * selector flag. xchg_xen_ulong must contain an - * appropriate barrier. - */ - if ((irq = per_cpu(virq_to_irq, cpu)[VIRQ_TIMER]) != -1) { - int evtchn = evtchn_from_irq(irq); - word_idx = evtchn / BITS_PER_LONG; - pending_bits = evtchn % BITS_PER_LONG; - if (active_evtchns(cpu, s, word_idx) & (1ULL << pending_bits)) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - } - - pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - xen_ulong_t words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = EVTCHN_FIRST_BIT(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } - - do { - xen_ulong_t bits; - int port; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = EVTCHN_FIRST_BIT(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_EVTCHN_WORD); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. */ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; - } - - BUG_ON(!irqs_disabled()); - - count = __this_cpu_read(xed_nesting_count); - __this_cpu_write(xed_nesting_count, 0); - } while (count != 1 || vcpu_info->evtchn_upcall_pending); - -out: - - put_cpu(); -} - -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - irq_enter(); -#ifdef CONFIG_X86 - exit_idle(); -#endif - - __xen_evtchn_do_upcall(); - - irq_exit(); - set_irq_regs(old_regs); -} - -void xen_hvm_evtchn_do_upcall(void) -{ - __xen_evtchn_do_upcall(); -} -EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); - -/* Rebind a new event channel to an existing irq. */ -void rebind_evtchn_irq(int evtchn, int irq) -{ - struct irq_info *info = info_for_irq(irq); - - if (WARN_ON(!info)) - return; - - /* Make sure the irq is masked, since the new event channel - will also be masked. 
*/ - disable_irq(irq); - - mutex_lock(&irq_mapping_update_lock); - - /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); - /* Expect irq to have been bound before, - so there should be a proper type */ - BUG_ON(info->type == IRQT_UNBOUND); - - xen_irq_info_evtchn_init(irq, evtchn); - - mutex_unlock(&irq_mapping_update_lock); - - /* new event channels are always bound to cpu 0 */ - irq_set_affinity(irq, cpumask_of(0)); - - /* Unmask the event channel. */ - enable_irq(irq); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return -1; - - /* - * Events delivered via platform PCI interrupts are always - * routed to vcpu 0 and hence cannot be rebound. - */ - if (xen_hvm_domain() && !xen_have_vector_callback) - return -1; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); - - return 0; -} - -static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, - bool force) -{ - unsigned tcpu = cpumask_first(dest); - - return rebind_irq_to_cpu(data->irq, tcpu); -} - -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); - sync_set_bit(evtchn, BM(s->evtchn_pending)); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - -static void enable_dynirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - irq_move_irq(data); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static void mask_ack_dynirq(struct irq_data *data) -{ - disable_dynirq(data); - ack_dynirq(data); -} - -static int retrigger_dynirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - int masked; - - masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask)); - sync_set_bit(evtchn, BM(sh->evtchn_pending)); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static void restore_pirqs(void) -{ - int pirq, rc, irq, gsi; - struct physdev_map_pirq map_irq; - struct irq_info *info; - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - - pirq = info->u.pirq.pirq; - gsi = info->u.pirq.gsi; - irq = info->irq; - - /* save/restore of PT devices doesn't work, so at this point the - * only devices present are GSI based emulated devices */ - if (!gsi) - continue; - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = pirq; - - rc = 
HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", - gsi, irq, pirq, rc); - xen_free_irq(irq); - continue; - } - - printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - - __startup_pirq(irq); - } -} - -static void restore_cpu_virqs(unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int virq, irq, evtchn; - - for (virq = 0; virq < NR_VIRQS; virq++) { - if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) - continue; - - BUG_ON(virq_from_irq(irq) != virq); - - /* Get a new binding from Xen. */ - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); - bind_evtchn_to_cpu(evtchn, cpu); - } -} - -static void restore_cpu_ipis(unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int ipi, irq, evtchn; - - for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { - if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) - continue; - - BUG_ON(ipi_from_irq(irq) != ipi); - - /* Get a new binding from Xen. */ - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - bind_evtchn_to_cpu(evtchn, cpu); - } -} - -/* Clear an irq's pending state, in preparation for polling on it */ -void xen_clear_irq_pending(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} -EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - set_evtchn(evtchn); -} - -bool xen_test_irq_pending(int irq) -{ - int evtchn = evtchn_from_irq(irq); - bool ret = false; - - if (VALID_EVTCHN(evtchn)) - ret = test_evtchn(evtchn); - - return ret; -} - -/* Poll waiting for an irq to become pending with timeout. In the usual case, - * the irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq_timeout(int irq, u64 timeout) -{ - evtchn_port_t evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) { - struct sched_poll poll; - - poll.nr_ports = 1; - poll.timeout = timeout; - set_xen_guest_handle(poll.ports, &evtchn); - - if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) - BUG(); - } -} -EXPORT_SYMBOL(xen_poll_irq_timeout); -/* Poll waiting for an irq to become pending. In the usual case, the - * irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq(int irq) -{ - xen_poll_irq_timeout(irq, 0 /* no timeout */); -} - -/* Check whether the IRQ line is shared with other guests. */ -int xen_test_irq_shared(int irq) -{ - struct irq_info *info = info_for_irq(irq); - struct physdev_irq_status_query irq_status; - - if (WARN_ON(!info)) - return -ENOENT; - - irq_status.irq = info->u.pirq.pirq; - - if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) - return 0; - return !(irq_status.flags & XENIRQSTAT_shared); -} -EXPORT_SYMBOL_GPL(xen_test_irq_shared); - -void xen_irq_resume(void) -{ - unsigned int cpu, evtchn; - struct irq_info *info; - - init_evtchn_cpu_bindings(); - - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); - - /* No IRQ <-> event-channel mappings. 
*/ - list_for_each_entry(info, &xen_irq_list_head, list) - info->evtchn = 0; /* zap event-channel binding */ - - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; - - for_each_possible_cpu(cpu) { - restore_cpu_virqs(cpu); - restore_cpu_ipis(cpu); - } - - restore_pirqs(); -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - - .irq_disable = disable_dynirq, - .irq_mask = disable_dynirq, - .irq_unmask = enable_dynirq, - - .irq_ack = ack_dynirq, - .irq_mask_ack = mask_ack_dynirq, - - .irq_set_affinity = set_affinity_irq, - .irq_retrigger = retrigger_dynirq, -}; - -static struct irq_chip xen_pirq_chip __read_mostly = { - .name = "xen-pirq", - - .irq_startup = startup_pirq, - .irq_shutdown = shutdown_pirq, - .irq_enable = enable_pirq, - .irq_disable = disable_pirq, - - .irq_mask = disable_dynirq, - .irq_unmask = enable_dynirq, - - .irq_ack = eoi_pirq, - .irq_eoi = eoi_pirq, - .irq_mask_ack = mask_ack_pirq, - - .irq_set_affinity = set_affinity_irq, - - .irq_retrigger = retrigger_dynirq, -}; - -static struct irq_chip xen_percpu_chip __read_mostly = { - .name = "xen-percpu", - - .irq_disable = disable_dynirq, - .irq_mask = disable_dynirq, - .irq_unmask = enable_dynirq, - - .irq_ack = ack_dynirq, -}; - -int xen_set_callback_via(uint64_t via) -{ - struct xen_hvm_param a; - a.domid = DOMID_SELF; - a.index = HVM_PARAM_CALLBACK_IRQ; - a.value = via; - return HYPERVISOR_hvm_op(HVMOP_set_param, &a); -} -EXPORT_SYMBOL_GPL(xen_set_callback_via); - -#ifdef CONFIG_XEN_PVHVM -/* Vector callbacks are better than PCI interrupts to receive event - * channel notifications because we can receive vector callbacks on any - * vcpu and we don't need PCI support or APIC interactions. */ -void xen_callback_vector(void) -{ - int rc; - uint64_t callback_via; - if (xen_have_vector_callback) { - callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); - rc = xen_set_callback_via(callback_via); - if (rc) { - pr_err("Request for Xen HVM callback vector failed\n"); - xen_have_vector_callback = 0; - return; - } - pr_info("Xen HVM callback vector for event delivery is enabled\n"); - /* in the restore case the vector has already been allocated */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); - } -} -#else -void xen_callback_vector(void) {} -#endif - -void __init xen_init_IRQ(void) -{ - int i; - - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. 
*/ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - pirq_needs_eoi = pirq_needs_eoi_flag; - -#ifdef CONFIG_X86 - if (xen_hvm_domain()) { - xen_callback_vector(); - native_init_IRQ(); - /* pci_xen_hvm_init must be called after native_init_IRQ so that - * __acpi_register_gsi can point at the right function */ - pci_xen_hvm_init(); - } else { - int rc; - struct physdev_pirq_eoi_gmfn eoi_gmfn; - - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); - rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); - if (rc != 0) { - free_page((unsigned long) pirq_eoi_map); - pirq_eoi_map = NULL; - } else - pirq_needs_eoi = pirq_check_eoi_map; - } -#endif -} diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 000000000000..92508d9a6bd2 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 000000000000..e3585330cf98 --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). 
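 *
 * (Typical use, as in EVTCHN_FIRST_BIT below:
 *  find_first_bit(BM(&w), BITS_PER_EVTCHN_WORD). With a 64-bit
 *  xen_ulong_t, event channel 130 would be bit 2 of word 2 of such an
 *  array.)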
+ */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_remove(evtchn_port_t evtchn, unsigned int cpu) +{ + clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, + unsigned int old_cpu) +{ + clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, old_cpu))); + set_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(evtchn_port_t port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(evtchn_port_t port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(evtchn_port_t port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_mask(evtchn_port_t port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(evtchn_port_t port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + smp_wmb(); /* All writes before unmask must be visible. */ + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. 
For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + evtchn_port_t evtchn; + + /* Timer interrupt has highest priority. */ + irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn); + if (irq != -1) { + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + evtchn_port_t port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + handle_irq_for_port(port, ctrl); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? 
xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %u%s%s%s\n", + cpu_from_evtchn(i), i, + irq_from_evtchn(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? "" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static void evtchn_2l_resume(void) +{ + int i; + + for_each_online_cpu(i) + memset(per_cpu(cpu_evtchn_mask, i), 0, sizeof(xen_ulong_t) * + EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); +} + +static int evtchn_2l_percpu_deinit(unsigned int cpu) +{ + memset(per_cpu(cpu_evtchn_mask, cpu), 0, sizeof(xen_ulong_t) * + EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); + + return 0; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .remove = evtchn_2l_remove, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, + .resume = evtchn_2l_resume, + .percpu_deinit = evtchn_2l_percpu_deinit, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c new file mode 100644 index 000000000000..9478fae014e5 --- /dev/null +++ b/drivers/xen/events/events_base.c @@ -0,0 +1,2333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. 
The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is received, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. PIRQs - Hardware interrupts. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/moduleparam.h> +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <linux/irqnr.h> +#include <linux/pci.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <linux/cpuhotplug.h> +#include <linux/atomic.h> +#include <linux/ktime.h> + +#ifdef CONFIG_X86 +#include <asm/desc.h> +#include <asm/ptrace.h> +#include <asm/idtentry.h> +#include <asm/irq.h> +#include <asm/io_apic.h> +#include <asm/i8259.h> +#include <asm/xen/cpuid.h> +#include <asm/xen/pci.h> +#endif +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <xen/page.h> + +#include <xen/xen.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <xen/xenbus.h> +#include <asm/hw_irq.h> + +#include "events_internal.h" + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + struct list_head eoi_list; + struct rcu_work rwork; + short refcnt; + u8 spurious_cnt; + u8 is_accounted; + short type; /* type: IRQT_* */ + u8 mask_reason; /* Why is event channel masked */ +#define EVT_MASK_REASON_EXPLICIT 0x01 +#define EVT_MASK_REASON_TEMPORARY 0x02 +#define EVT_MASK_REASON_EOI_PENDING 0x04 + u8 is_active; /* Is event just being handled? */ + unsigned irq; + evtchn_port_t evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + unsigned short eoi_cpu; /* EOI must happen on this cpu-1 */ + unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ + u64 eoi_time; /* Time in jiffies when to EOI. 
*/ + raw_spinlock_t lock; + bool is_static; /* Is event channel static */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + struct xenbus_device *interdomain; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +static uint __read_mostly event_loop_timeout = 2; +module_param(event_loop_timeout, uint, 0644); + +static uint __read_mostly event_eoi_delay = 10; +module_param(event_eoi_delay, uint, 0644); + +const struct evtchn_ops *evtchn_ops; + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_MUTEX(irq_mapping_update_lock); + +/* + * Lock hierarchy: + * + * irq_mapping_update_lock + * IRQ-desc lock + * percpu eoi_list_lock + * irq_info->lock + */ + +static LIST_HEAD(xen_irq_list_head); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */ +static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... XEN_NR_IPIS-1] = 0}; + +/* Event channel distribution data */ +static atomic_t channels_on_cpu[NR_CPUS]; + +static int **evtchn_to_irq; +#ifdef CONFIG_X86 +static unsigned long *pirq_eoi_map; +#endif +static bool (*pirq_needs_eoi)(struct irq_info *info); + +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY]; + +static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_lateeoi_chip; +static struct irq_chip xen_percpu_chip; +static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); + +static DEFINE_PER_CPU(unsigned int, irq_epoch); + +static void clear_evtchn_to_irq_row(int *evtchn_row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + WRITE_ONCE(evtchn_row[col], -1); +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(evtchn_to_irq[row]); + } +} + +static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) +{ + unsigned row; + unsigned col; + int *evtchn_row; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0); + if (evtchn_row == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(evtchn_row); + + /* + * We've prepared an empty row for the mapping. If a different + * thread was faster inserting it, we can drop ours. 
+ */ + if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL) + free_page((unsigned long) evtchn_row); + } + + WRITE_ONCE(evtchn_to_irq[row][col], irq); + return 0; +} + +/* Get info for IRQ */ +static struct irq_info *info_for_irq(unsigned irq) +{ + if (irq < nr_legacy_irqs()) + return legacy_info_ptrs[irq]; + else + return irq_get_chip_data(irq); +} + +static void set_info_for_irq(unsigned int irq, struct irq_info *info) +{ + if (irq < nr_legacy_irqs()) + legacy_info_ptrs[irq] = info; + else + irq_set_chip_data(irq, info); +} + +static struct irq_info *evtchn_to_info(evtchn_port_t evtchn) +{ + int irq; + + if (evtchn >= xen_evtchn_max_channels()) + return NULL; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return NULL; + irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); + + return (irq < 0) ? NULL : info_for_irq(irq); +} + +/* Per CPU channel accounting */ +static void channels_on_cpu_dec(struct irq_info *info) +{ + if (!info->is_accounted) + return; + + info->is_accounted = 0; + + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1 , 0)); +} + +static void channels_on_cpu_inc(struct irq_info *info) +{ + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1, + INT_MAX))) + return; + + info->is_accounted = 1; +} + +static void xen_irq_free_desc(unsigned int irq) +{ + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + +static void delayed_free_irq(struct work_struct *work) +{ + struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, + rwork); + unsigned int irq = info->irq; + + /* Remove the info pointer only now, with no potential users left. */ + set_info_for_irq(irq, NULL); + + kfree(info); + + xen_irq_free_desc(irq); +} + +/* Constructors for packed IRQ information. 
*/ +static int xen_irq_info_common_setup(struct irq_info *info, + enum xen_irq_type type, + evtchn_port_t evtchn, + unsigned short cpu) +{ + int ret; + + BUG_ON(info->type != IRQT_UNBOUND && info->type != type); + + info->type = type; + info->evtchn = evtchn; + info->cpu = cpu; + info->mask_reason = EVT_MASK_REASON_EXPLICIT; + raw_spin_lock_init(&info->lock); + + ret = set_evtchn_to_irq(evtchn, info->irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(evtchn); +} + +static int xen_irq_info_evtchn_setup(struct irq_info *info, + evtchn_port_t evtchn, + struct xenbus_device *dev) +{ + int ret; + + ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0); + info->u.interdomain = dev; + if (dev) + atomic_inc(&dev->event_channels); + + return ret; +} + +static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, enum ipi_vector ipi) +{ + info->u.ipi = ipi; + + per_cpu(ipi_to_irq, cpu)[ipi] = info->irq; + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; + + return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0); +} + +static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, unsigned int virq) +{ + info->u.virq = virq; + + per_cpu(virq_to_irq, cpu)[virq] = info->irq; + + return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0); +} + +static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn, + unsigned int pirq, unsigned int gsi, + uint16_t domid, unsigned char flags) +{ + info->u.pirq.pirq = pirq; + info->u.pirq.gsi = gsi; + info->u.pirq.domid = domid; + info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + xen_evtchn_port_remove(info->evtchn, info->cpu); + info->evtchn = 0; + channels_on_cpu_dec(info); +} + +/* + * Accessors for packed IRQ information. + */ +static evtchn_port_t evtchn_from_irq(unsigned int irq) +{ + const struct irq_info *info = NULL; + + if (likely(irq < irq_get_nr_irqs())) + info = info_for_irq(irq); + if (!info) + return 0; + + return info->evtchn; +} + +unsigned int irq_from_evtchn(evtchn_port_t evtchn) +{ + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? info->irq : -1; +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn) +{ + int irq = per_cpu(virq_to_irq, cpu)[virq]; + + *evtchn = evtchn_from_irq(irq); + + return irq; +} + +static enum ipi_vector ipi_from_irq(struct irq_info *info) +{ + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_IPI); + + return info->u.ipi; +} + +static unsigned int virq_from_irq(struct irq_info *info) +{ + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_VIRQ); + + return info->u.virq; +} + +static unsigned int pirq_from_irq(struct irq_info *info) +{ + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.pirq; +} + +unsigned int cpu_from_evtchn(evtchn_port_t evtchn) +{ + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? 
info->cpu : 0; +} + +static void do_mask(struct irq_info *info, u8 reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&info->lock, flags); + + if (!info->mask_reason) + mask_evtchn(info->evtchn); + + info->mask_reason |= reason; + + raw_spin_unlock_irqrestore(&info->lock, flags); +} + +static void do_unmask(struct irq_info *info, u8 reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&info->lock, flags); + + info->mask_reason &= ~reason; + + if (!info->mask_reason) + unmask_evtchn(info->evtchn); + + raw_spin_unlock_irqrestore(&info->lock, flags); +} + +#ifdef CONFIG_X86 +static bool pirq_check_eoi_map(struct irq_info *info) +{ + return test_bit(pirq_from_irq(info), pirq_eoi_map); +} +#endif + +static bool pirq_needs_eoi_flag(struct irq_info *info) +{ + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.flags & PIRQ_NEEDS_EOI; +} + +static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu, + bool force_affinity) +{ + if (IS_ENABLED(CONFIG_SMP) && force_affinity) { + struct irq_data *data = irq_get_irq_data(info->irq); + + irq_data_update_affinity(data, cpumask_of(cpu)); + irq_data_update_effective_affinity(data, cpumask_of(cpu)); + } + + xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu); + + channels_on_cpu_dec(info); + info->cpu = cpu; + channels_on_cpu_inc(info); +} + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + evtchn_port_t evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +struct lateeoi_work { + struct delayed_work delayed; + spinlock_t eoi_list_lock; + struct list_head eoi_list; +}; + +static DEFINE_PER_CPU(struct lateeoi_work, lateeoi); + +static void lateeoi_list_del(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + unsigned long flags; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + list_del_init(&info->eoi_list); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void lateeoi_list_add(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + struct irq_info *elem; + u64 now = get_jiffies_64(); + unsigned long delay; + unsigned long flags; + + if (now < info->eoi_time) + delay = info->eoi_time - now; + else + delay = 1; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + + elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + if (!elem || info->eoi_time < elem->eoi_time) { + list_add(&info->eoi_list, &eoi->eoi_list); + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, delay); + } else { + list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) { + if (elem->eoi_time <= info->eoi_time) + break; + } + list_add(&info->eoi_list, &elem->eoi_list); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) +{ + evtchn_port_t evtchn; + unsigned int cpu; + unsigned int delay = 0; + + evtchn = info->evtchn; + if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list)) + return; + + if (spurious) { + struct xenbus_device *dev = info->u.interdomain; + unsigned int threshold = 1; + + if (dev && dev->spurious_threshold) + threshold = dev->spurious_threshold; + + 
if ((1 << info->spurious_cnt) < (HZ << 2)) { + if (info->spurious_cnt != 0xFF) + info->spurious_cnt++; + } + if (info->spurious_cnt > threshold) { + delay = 1 << (info->spurious_cnt - 1 - threshold); + if (delay > HZ) + delay = HZ; + if (!info->eoi_time) + info->eoi_cpu = smp_processor_id(); + info->eoi_time = get_jiffies_64() + delay; + if (dev) + atomic_add(delay, &dev->jiffies_eoi_delayed); + } + if (dev) + atomic_inc(&dev->spurious_events); + } else { + info->spurious_cnt = 0; + } + + cpu = info->eoi_cpu; + if (info->eoi_time && + (info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) { + lateeoi_list_add(info); + return; + } + + info->eoi_time = 0; + + /* is_active hasn't been reset yet, do it now. */ + smp_store_release(&info->is_active, 0); + do_unmask(info, EVT_MASK_REASON_EOI_PENDING); +} + +static void xen_irq_lateeoi_worker(struct work_struct *work) +{ + struct lateeoi_work *eoi; + struct irq_info *info; + u64 now = get_jiffies_64(); + unsigned long flags; + + eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed); + + rcu_read_lock(); + + while (true) { + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + + info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + + if (info == NULL) + break; + + if (now < info->eoi_time) { + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, + info->eoi_time - now); + break; + } + + list_del_init(&info->eoi_list); + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); + + info->eoi_time = 0; + + xen_irq_lateeoi_locked(info, false); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); + + rcu_read_unlock(); +} + +static void xen_cpu_init_eoi(unsigned int cpu) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu); + + INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker); + spin_lock_init(&eoi->eoi_list_lock); + INIT_LIST_HEAD(&eoi->eoi_list); +} + +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) +{ + struct irq_info *info; + + rcu_read_lock(); + + info = info_for_irq(irq); + + if (info) + xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(xen_irq_lateeoi); + +static struct irq_info *xen_irq_init(unsigned int irq) +{ + struct irq_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info) { + info->irq = irq; + info->type = IRQT_UNBOUND; + info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, delayed_free_irq); + + set_info_for_irq(irq, info); + INIT_LIST_HEAD(&info->eoi_list); + list_add_tail(&info->list, &xen_irq_list_head); + } + + return info; +} + +static struct irq_info *xen_allocate_irq_dynamic(void) +{ + int irq = irq_alloc_desc_from(0, -1); + struct irq_info *info = NULL; + + if (irq >= 0) { + info = xen_irq_init(irq); + if (!info) + xen_irq_free_desc(irq); + } + + return info; +} + +static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) +{ + int irq; + struct irq_info *info; + + /* + * A PV guest has no concept of a GSI (since it has no ACPI + * nor access to/knowledge of the physical APICs). Therefore + * all IRQs are dynamically allocated from the entire IRQ + * space. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return xen_allocate_irq_dynamic(); + + /* Legacy IRQ descriptors are already allocated by the arch. 
*/ + if (gsi < nr_legacy_irqs()) + irq = gsi; + else + irq = irq_alloc_desc_at(gsi, -1); + + info = xen_irq_init(irq); + if (!info) + xen_irq_free_desc(irq); + + return info; +} + +static void xen_free_irq(struct irq_info *info) +{ + if (WARN_ON(!info)) + return; + + if (!list_empty(&info->eoi_list)) + lateeoi_list_del(info); + + list_del(&info->list); + + WARN_ON(info->refcnt > 0); + + queue_rcu_work(system_wq, &info->rwork); +} + +/* Not called for lateeoi events. */ +static void event_handler_exit(struct irq_info *info) +{ + smp_store_release(&info->is_active, 0); + clear_evtchn(info->evtchn); +} + +static void pirq_query_unmask(struct irq_info *info) +{ + struct physdev_irq_status_query irq_status; + + irq_status.irq = pirq_from_irq(info); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + + info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; + if (irq_status.flags & XENIRQSTAT_needs_eoi) + info->u.pirq.flags |= PIRQ_NEEDS_EOI; +} + +static void do_eoi_pirq(struct irq_info *info) +{ + struct physdev_eoi eoi = { .irq = pirq_from_irq(info) }; + int rc = 0; + + if (!VALID_EVTCHN(info->evtchn)) + return; + + event_handler_exit(info); + + if (pirq_needs_eoi(info)) { + rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void eoi_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + do_eoi_pirq(info); +} + +static void do_disable_dynirq(struct irq_info *info) +{ + if (VALID_EVTCHN(info->evtchn)) + do_mask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void disable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_disable_dynirq(info); +} + +static void mask_ack_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_eoi_pirq(info); + } +} + +static unsigned int __startup_pirq(struct irq_info *info) +{ + struct evtchn_bind_pirq bind_pirq; + evtchn_port_t evtchn = info->evtchn; + int rc; + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = pirq_from_irq(info); + /* NB. We are happy to share unless we are probing. */ + bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
+ BIND_PIRQ__WILL_SHARE : 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); + if (rc != 0) { + pr_warn("Failed to obtain physical IRQ %d\n", info->irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(info); + + rc = set_evtchn_to_irq(evtchn, info->irq); + if (rc) + goto err; + + info->evtchn = evtchn; + bind_evtchn_to_cpu(info, 0, false); + + rc = xen_evtchn_port_setup(evtchn); + if (rc) + goto err; + +out: + do_unmask(info, EVT_MASK_REASON_EXPLICIT); + + do_eoi_pirq(info); + + return 0; + +err: + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq, + rc); + xen_evtchn_close(evtchn); + return 0; +} + +static unsigned int startup_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + return __startup_pirq(info); +} + +static void shutdown_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; + + BUG_ON(info->type != IRQT_PIRQ); + + if (!VALID_EVTCHN(evtchn)) + return; + + do_mask(info, EVT_MASK_REASON_EXPLICIT); + xen_irq_info_cleanup(info); + xen_evtchn_close(evtchn); +} + +static void enable_pirq(struct irq_data *data) +{ + enable_dynirq(data); +} + +static void disable_pirq(struct irq_data *data) +{ + disable_dynirq(data); +} + +int xen_irq_from_gsi(unsigned gsi) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + if (info->u.pirq.gsi == gsi) + return info->irq; + } + + return -1; +} +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); + +static void __unbind_from_irq(struct irq_info *info, unsigned int irq) +{ + evtchn_port_t evtchn; + bool close_evtchn = false; + + if (!info) { + xen_irq_free_desc(irq); + return; + } + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + evtchn = info->evtchn; + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = info->cpu; + struct xenbus_device *dev; + + if (!info->is_static) + close_evtchn = true; + + switch (info->type) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0; + break; + case IRQT_EVTCHN: + dev = info->u.interdomain; + if (dev) + atomic_dec(&dev->event_channels); + break; + default: + break; + } + + xen_irq_info_cleanup(info); + + if (close_evtchn) + xen_evtchn_close(evtchn); + } + + xen_free_irq(info); +} + +/* + * Do not make any assumptions regarding the relationship between the + * IRQ number returned here and the Xen pirq argument. + * + * Note: We don't assign an event channel until the irq actually started + * up. Return an existing irq if we've already got one for the gsi. + * + * Shareable implies level triggered, not shareable implies edge + * triggered here. + */ +int xen_bind_pirq_gsi_to_irq(unsigned gsi, + unsigned pirq, int shareable, char *name) +{ + struct irq_info *info; + struct physdev_irq irq_op; + int ret; + + mutex_lock(&irq_mapping_update_lock); + + ret = xen_irq_from_gsi(gsi); + if (ret != -1) { + pr_info("%s: returning irq %d for gsi %u\n", + __func__, ret, gsi); + goto out; + } + + info = xen_allocate_irq_gsi(gsi); + if (!info) + goto out; + + irq_op.irq = info->irq; + irq_op.vector = 0; + + /* Only the privileged domain can do this. For non-priv, the pcifront + * driver provides a PCI bus that does the call to do exactly + * this in the priv domain. 
*/ + if (xen_initial_domain() && + HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + xen_free_irq(info); + ret = -ENOSPC; + goto out; + } + + ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF, + shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(info, info->irq); + goto out; + } + + pirq_query_unmask(info); + /* We try to use the handler with the appropriate semantic for the + * type of interrupt: if the interrupt is an edge triggered + * interrupt we use handle_edge_irq. + * + * On the other hand if the interrupt is level triggered we use + * handle_fasteoi_irq like the native code does for this kind of + * interrupts. + * + * Depending on the Xen version, pirq_needs_eoi might return true + * not only for level triggered interrupts but for edge triggered + * interrupts too. In any case Xen always honors the eoi mechanism, + * not injecting any more pirqs of the same kind if the first one + * hasn't received an eoi yet. Therefore using the fasteoi handler + * is the right choice either way. + */ + if (shareable) + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, + handle_fasteoi_irq, name); + else + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, + handle_edge_irq, name); + + ret = info->irq; + +out: + mutex_unlock(&irq_mapping_update_lock); + + return ret; +} + +#ifdef CONFIG_PCI_MSI +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) +{ + int rc; + struct physdev_get_free_pirq op_get_free_pirq; + + op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); + + WARN_ONCE(rc == -ENOSYS, + "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); + + return rc ? -1 : op_get_free_pirq.pirq; +} + +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, int nvec, const char *name, domid_t domid) +{ + int i, irq, ret; + struct irq_info *info; + + mutex_lock(&irq_mapping_update_lock); + + irq = irq_alloc_descs(-1, 0, nvec, -1); + if (irq < 0) + goto out; + + for (i = 0; i < nvec; i++) { + info = xen_irq_init(irq + i); + if (!info) { + ret = -ENOMEM; + goto error_irq; + } + + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } + + ret = irq_set_msi_desc(irq, msidesc); + if (ret < 0) + goto error_irq; +out: + mutex_unlock(&irq_mapping_update_lock); + return irq; + +error_irq: + while (nvec--) { + info = info_for_irq(irq + nvec); + __unbind_from_irq(info, irq + nvec); + } + mutex_unlock(&irq_mapping_update_lock); + return ret; +} +#endif + +int xen_destroy_irq(int irq) +{ + struct physdev_unmap_pirq unmap_irq; + struct irq_info *info = info_for_irq(irq); + int rc = -ENOENT; + + mutex_lock(&irq_mapping_update_lock); + + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { + unmap_irq.pirq = info->u.pirq.pirq; + unmap_irq.domid = info->u.pirq.domid; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); + /* If another domain quits without making the pci_disable_msix + * call, the Xen hypervisor takes care of freeing the PIRQs + * (free_domain_pirqs). 
+ */ + if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) + pr_info("domain %d does not have %d anymore\n", + info->u.pirq.domid, info->u.pirq.pirq); + else if (rc) { + pr_warn("unmap irq failed %d\n", rc); + goto out; + } + } + + xen_free_irq(info); + +out: + mutex_unlock(&irq_mapping_update_lock); + return rc; +} + +int xen_pirq_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + return pirq_from_irq(info); +} +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + +static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, + struct xenbus_device *dev, bool shared) +{ + int ret = -ENOMEM; + struct irq_info *info; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; + + mutex_lock(&irq_mapping_update_lock); + + info = evtchn_to_info(evtchn); + + if (!info) { + info = xen_allocate_irq_dynamic(); + if (!info) + goto out; + + irq_set_chip_and_handler_name(info->irq, chip, + handle_edge_irq, "event"); + + ret = xen_irq_info_evtchn_setup(info, evtchn, dev); + if (ret < 0) { + __unbind_from_irq(info, info->irq); + goto out; + } + /* + * New interdomain events are initially bound to vCPU0 This + * is required to setup the event channel in the first + * place and also important for UP guests because the + * affinity setting is not invoked on them so nothing would + * bind the channel. + */ + bind_evtchn_to_cpu(info, 0, false); + } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { + if (shared && !WARN_ON(info->refcnt < 0)) + info->refcnt++; + } + + ret = info->irq; + +out: + mutex_unlock(&irq_mapping_update_lock); + + return ret; +} + +int bind_evtchn_to_irq(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + evtchn_port_t evtchn; + struct irq_info *info; + int ret; + + mutex_lock(&irq_mapping_update_lock); + + ret = per_cpu(ipi_to_irq, cpu)[ipi]; + + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) + goto out; + + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, + handle_percpu_irq, "ipi"); + + bind_ipi.vcpu = xen_vcpu_nr(cpu); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(info, info->irq); + goto out; + } + /* + * Force the affinity mask to the target CPU so proc shows + * the correct target. + */ + bind_evtchn_to_cpu(info, cpu, true); + ret = info->irq; + } else { + info = info_for_irq(ret); + WARN_ON(info == NULL || info->type != IRQT_IPI); + } + + out: + mutex_unlock(&irq_mapping_update_lock); + return ret; +} + +static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, + evtchn_port_t remote_port, + struct irq_chip *chip, + bool shared) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = dev->otherend_id; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? 
: bind_evtchn_to_irq_chip(bind_interdomain.local_port, + chip, dev, shared); +} + +int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, + &xen_lateeoi_chip, false); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); + +static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn, + bool percpu) +{ + struct evtchn_status status; + evtchn_port_t port; + bool exists = false; + + memset(&status, 0, sizeof(status)); + for (port = 0; port < xen_evtchn_max_channels(); port++) { + int rc; + + status.dom = DOMID_SELF; + status.port = port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); + if (rc < 0) + continue; + if (status.status != EVTCHNSTAT_virq) + continue; + if (status.u.virq != virq) + continue; + if (status.vcpu == xen_vcpu_nr(cpu)) { + *evtchn = port; + return 0; + } else if (!percpu) { + exists = true; + } + } + return exists ? -EEXIST : -ENOENT; +} + +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + +int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) +{ + struct evtchn_bind_virq bind_virq; + evtchn_port_t evtchn = 0; + struct irq_info *info; + int ret; + + mutex_lock(&irq_mapping_update_lock); + + ret = per_cpu(virq_to_irq, cpu)[virq]; + + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) + goto out; + + if (percpu) + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, + handle_percpu_irq, "virq"); + else + irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip, + handle_edge_irq, "virq"); + + bind_virq.virq = virq; + bind_virq.vcpu = xen_vcpu_nr(cpu); + ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (ret == 0) + evtchn = bind_virq.port; + else { + if (ret == -EEXIST) + ret = find_virq(virq, cpu, &evtchn, percpu); + if (ret) { + __unbind_from_irq(info, info->irq); + goto out; + } + } + + ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(info, info->irq); + goto out; + } + + /* + * Force the affinity mask for percpu interrupts so proc + * shows the correct target. 
+ */ + bind_evtchn_to_cpu(info, cpu, percpu); + ret = info->irq; + } else { + info = info_for_irq(ret); + WARN_ON(info == NULL || info->type != IRQT_VIRQ); + } + +out: + mutex_unlock(&irq_mapping_update_lock); + + return ret; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct irq_info *info; + + mutex_lock(&irq_mapping_update_lock); + info = info_for_irq(irq); + __unbind_from_irq(info, irq); + mutex_unlock(&irq_mapping_update_lock); +} + +static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id, + struct irq_chip *chip) +{ + int irq, retval; + + irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL, + irqflags & IRQF_SHARED); + if (irq < 0) + return irq; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_dynamic_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); + +static int bind_interdomain_evtchn_to_irqhandler_chip( + struct xenbus_device *dev, evtchn_port_t remote_port, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, struct irq_chip *chip) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip, + irqflags & IRQF_SHARED); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(dev, + remote_port, handler, irqflags, devname, + dev_id, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +{ + int irq, retval; + + irq = bind_virq_to_irq(virq, cpu, irqflags & IRQF_PERCPU); + if (irq < 0) + return irq; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + struct irq_info *info = info_for_irq(irq); + + if (WARN_ON(!info)) + return; + 
free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. + */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + +int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) +{ + struct irq_info *info = evtchn_to_info(evtchn); + + if (!info) + return -ENOENT; + + WARN_ON(info->refcnt != -1); + + info->refcnt = 1; + info->is_static = is_static; + + return 0; +} +EXPORT_SYMBOL_GPL(evtchn_make_refcounted); + +int evtchn_get(evtchn_port_t evtchn) +{ + struct irq_info *info; + int err = -ENOENT; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + mutex_lock(&irq_mapping_update_lock); + + info = evtchn_to_info(evtchn); + + if (!info) + goto done; + + err = -EINVAL; + if (info->refcnt <= 0 || info->refcnt == SHRT_MAX) + goto done; + + info->refcnt++; + err = 0; + done: + mutex_unlock(&irq_mapping_update_lock); + + return err; +} +EXPORT_SYMBOL_GPL(evtchn_get); + +void evtchn_put(evtchn_port_t evtchn) +{ + struct irq_info *info = evtchn_to_info(evtchn); + + if (WARN_ON(!info)) + return; + unbind_from_irq(info->irq); +} +EXPORT_SYMBOL_GPL(evtchn_put); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + evtchn_port_t evtchn; + +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, xen_vcpu_nr(cpu), + NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; + } +#endif + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + BUG_ON(evtchn == 0); + notify_remote_via_evtchn(evtchn); +} + +struct evtchn_loop_ctrl { + ktime_t timeout; + unsigned count; + bool defer_eoi; +}; + +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) +{ + struct irq_info *info = evtchn_to_info(port); + struct xenbus_device *dev; + + if (!info) + return; + + /* + * Check for timeout every 256 events. + * We are setting the timeout value only after the first 256 + * events in order to not hurt the common case of few loop + * iterations. The 256 is basically an arbitrary value. + * + * In case we are hitting the timeout we need to defer all further + * EOIs in order to ensure to leave the event handling loop rather + * sooner than later. + */ + if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) { + ktime_t kt = ktime_get(); + + if (!ctrl->timeout) { + kt = ktime_add_ms(kt, + jiffies_to_msecs(event_loop_timeout)); + ctrl->timeout = kt; + } else if (kt > ctrl->timeout) { + ctrl->defer_eoi = true; + } + } + + if (xchg_acquire(&info->is_active, 1)) + return; + + dev = (info->type == IRQT_EVTCHN) ? info->u.interdomain : NULL; + if (dev) + atomic_inc(&dev->events); + + if (ctrl->defer_eoi) { + info->eoi_cpu = smp_processor_id(); + info->irq_epoch = __this_cpu_read(irq_epoch); + info->eoi_time = get_jiffies_64() + event_eoi_delay; + } + + generic_handle_irq(info->irq); +} + +int xen_evtchn_do_upcall(void) +{ + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int ret = vcpu_info->evtchn_upcall_pending ? 
IRQ_HANDLED : IRQ_NONE; + int cpu = smp_processor_id(); + struct evtchn_loop_ctrl ctrl = { 0 }; + + /* + * When closing an event channel the associated IRQ must not be freed + * until all cpus have left the event handling loop. This is ensured + * by taking the rcu_read_lock() while handling events, as freeing of + * the IRQ is handled via queue_rcu_work() _after_ closing the event + * channel. + */ + rcu_read_lock(); + + do { + vcpu_info->evtchn_upcall_pending = 0; + + xen_evtchn_handle_events(cpu, &ctrl); + + BUG_ON(!irqs_disabled()); + + virt_rmb(); /* Hypervisor can set upcall pending. */ + + } while (vcpu_info->evtchn_upcall_pending); + + rcu_read_unlock(); + + /* + * Increment irq_epoch only now to defer EOIs only for + * xen_irq_lateeoi() invocations occurring from inside the loop + * above. + */ + __this_cpu_inc(irq_epoch); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_evtchn_do_upcall); + +/* Rebind a new event channel to an existing irq. */ +void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) +{ + struct irq_info *info = info_for_irq(irq); + + if (WARN_ON(!info)) + return; + + /* Make sure the irq is masked, since the new event channel + will also be masked. */ + disable_irq(irq); + + mutex_lock(&irq_mapping_update_lock); + + /* After resume the irq<->evtchn mappings are all cleared out */ + BUG_ON(evtchn_to_info(evtchn)); + /* Expect irq to have been bound before, + so there should be a proper type */ + BUG_ON(info->type == IRQT_UNBOUND); + + info->irq = irq; + (void)xen_irq_info_evtchn_setup(info, evtchn, NULL); + + mutex_unlock(&irq_mapping_update_lock); + + bind_evtchn_to_cpu(info, info->cpu, false); + + /* Unmask the event channel. */ + enable_irq(irq); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (!VALID_EVTCHN(evtchn)) + return -1; + + if (!xen_support_evtchn_rebind()) + return -1; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = xen_vcpu_nr(tcpu); + + /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + do_mask(info, EVT_MASK_REASON_TEMPORARY); + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. + */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) { + int old_cpu = info->cpu; + + bind_evtchn_to_cpu(info, tcpu, false); + + if (info->type == IRQT_VIRQ) { + int virq = info->u.virq; + int irq = per_cpu(virq_to_irq, old_cpu)[virq]; + + per_cpu(virq_to_irq, old_cpu)[virq] = -1; + per_cpu(virq_to_irq, tcpu)[virq] = irq; + } + } + + do_unmask(info, EVT_MASK_REASON_TEMPORARY); + + return 0; +} + +/* + * Find the CPU within @dest mask which has the least number of channels + * assigned. This is not precise as the per cpu counts can be modified + * concurrently. + */ +static unsigned int select_target_cpu(const struct cpumask *dest) +{ + unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX; + + for_each_cpu_and(cpu, dest, cpu_online_mask) { + unsigned int curch = atomic_read(&channels_on_cpu[cpu]); + + if (curch < minch) { + minch = curch; + best_cpu = cpu; + } + } + + /* + * Catch the unlikely case that dest contains no online CPUs. Can't + * recurse. 
+ */ + if (best_cpu == UINT_MAX) + return select_target_cpu(cpu_online_mask); + + return best_cpu; +} + +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + unsigned int tcpu = select_target_cpu(dest); + int ret; + + ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu); + if (!ret) + irq_data_update_effective_affinity(data, cpumask_of(tcpu)); + + return ret; +} + +static void enable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) + do_unmask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void do_ack_dynirq(struct irq_info *info) +{ + evtchn_port_t evtchn = info->evtchn; + + if (VALID_EVTCHN(evtchn)) + event_handler_exit(info); +} + +static void ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_ack_dynirq(info); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_ack_dynirq(info); + } +} + +static void lateeoi_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EOI_PENDING); + /* + * Don't call event_handler_exit(). + * Need to keep is_active non-zero in order to ignore re-raised + * events after cpu affinity changes while a lateeoi is pending. + */ + clear_evtchn(evtchn); + } +} + +static void lateeoi_mask_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); + } +} + +static int retrigger_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (!VALID_EVTCHN(evtchn)) + return 0; + + do_mask(info, EVT_MASK_REASON_TEMPORARY); + set_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_TEMPORARY); + + return 1; +} + +static void restore_pirqs(void) +{ + int pirq, rc, irq, gsi; + struct physdev_map_pirq map_irq; + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + + /* save/restore of PT devices doesn't work, so at this point the + * only devices present are GSI based emulated devices */ + if (!gsi) + continue; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); + xen_free_irq(info); + continue; + } + + printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + + __startup_pirq(info); + } +} + +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + evtchn_port_t evtchn; + struct irq_info *info; + int virq, irq; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + info = info_for_irq(irq); + + BUG_ON(virq_from_irq(info) != virq); + + /* Get a new binding from Xen. 
*/ + bind_virq.virq = virq; + bind_virq.vcpu = xen_vcpu_nr(cpu); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + xen_irq_info_virq_setup(info, cpu, evtchn, virq); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(info, cpu, false); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + evtchn_port_t evtchn; + struct irq_info *info; + int ipi, irq; + + for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + info = info_for_irq(irq); + + BUG_ON(ipi_from_irq(info) != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = xen_vcpu_nr(cpu); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(info, cpu, false); + } +} + +/* Clear an irq's pending state, in preparation for polling on it */ +void xen_clear_irq_pending(int irq) +{ + struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) + event_handler_exit(info); +} +EXPORT_SYMBOL(xen_clear_irq_pending); + +bool xen_test_irq_pending(int irq) +{ + evtchn_port_t evtchn = evtchn_from_irq(irq); + bool ret = false; + + if (VALID_EVTCHN(evtchn)) + ret = test_evtchn(evtchn); + + return ret; +} + +/* Poll waiting for an irq to become pending with timeout. In the usual case, + * the irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq_timeout(int irq, u64 timeout) +{ + evtchn_port_t evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + struct sched_poll poll; + + poll.nr_ports = 1; + poll.timeout = timeout; + set_xen_guest_handle(poll.ports, &evtchn); + + if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) + BUG(); + } +} +EXPORT_SYMBOL(xen_poll_irq_timeout); +/* Poll waiting for an irq to become pending. In the usual case, the + * irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ + xen_poll_irq_timeout(irq, 0 /* no timeout */); +} + +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ + struct irq_info *info = info_for_irq(irq); + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + +void xen_irq_resume(void) +{ + unsigned int cpu; + struct irq_info *info; + + /* New event-channel space is not 'live' yet. */ + xen_evtchn_resume(); + + /* No IRQ <-> event-channel mappings. 
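+	 * Every binding recorded before suspend is stale at this point, so
+	 * zap it here and rebuild the VIRQ, IPI and PIRQ bindings below.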
*/ + list_for_each_entry(info, &xen_irq_list_head, list) { + /* Zap event-channel binding */ + info->evtchn = 0; + /* Adjust accounting */ + channels_on_cpu_dec(info); + } + + clear_evtchn_to_irq_all(); + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } + + restore_pirqs(); +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_lateeoi_chip __read_mostly = { + /* The chip name needs to contain "xen-dyn" for irqbalance to work. */ + .name = "xen-dyn-lateeoi", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = lateeoi_ack_dynirq, + .irq_mask_ack = lateeoi_mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_pirq_chip __read_mostly = { + .name = "xen-pirq", + + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, + .irq_enable = enable_pirq, + .irq_disable = disable_pirq, + + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = eoi_pirq, + .irq_eoi = eoi_pirq, + .irq_mask_ack = mask_ack_pirq, + + .irq_set_affinity = set_affinity_irq, + + .irq_retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_percpu_chip __read_mostly = { + .name = "xen-percpu", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = ack_dynirq, +}; + +#ifdef CONFIG_X86 +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. */ +void xen_setup_callback_vector(void) +{ + uint64_t callback_via; + + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); + if (xen_set_callback_via(callback_via)) { + pr_err("Request for Xen HVM callback vector failed\n"); + xen_have_vector_callback = false; + } + } +} + +/* + * Setup per-vCPU vector-type callbacks. If this setup is unavailable, + * fallback to the global vector-type callback. + */ +static __init void xen_init_setup_upcall_vector(void) +{ + if (!xen_have_vector_callback) + return; + + if ((cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_UPCALL_VECTOR) && + !xen_set_upcall_vector(0)) + xen_percpu_upcall = true; + else if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_setup_callback_vector(); + else + xen_have_vector_callback = false; +} + +int xen_set_upcall_vector(unsigned int cpu) +{ + int rc; + xen_hvm_evtchn_upcall_vector_t op = { + .vector = HYPERVISOR_CALLBACK_VECTOR, + .vcpu = per_cpu(xen_vcpu_id, cpu), + }; + + rc = HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &op); + if (rc) + return rc; + + /* Trick toolstack to think we are enlightened. 
*/ + if (!cpu) + rc = xen_set_callback_via(1); + + return rc; +} + +static __init void xen_alloc_callback_vector(void) +{ + if (!xen_have_vector_callback) + return; + + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); +} +#else +void xen_setup_callback_vector(void) {} +static inline void xen_init_setup_upcall_vector(void) {} +int xen_set_upcall_vector(unsigned int cpu) {} +static inline void xen_alloc_callback_vector(void) {} +#endif /* CONFIG_XEN_PVHVM */ +#endif /* CONFIG_X86 */ + +bool xen_fifo_events = true; +module_param_named(fifo_events, xen_fifo_events, bool, 0); + +static int xen_evtchn_cpu_prepare(unsigned int cpu) +{ + int ret = 0; + + xen_cpu_init_eoi(cpu); + + if (evtchn_ops->percpu_init) + ret = evtchn_ops->percpu_init(cpu); + + return ret; +} + +static int xen_evtchn_cpu_dead(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_deinit) + ret = evtchn_ops->percpu_deinit(cpu); + + return ret; +} + +void __init xen_init_IRQ(void) +{ + int ret = -EINVAL; + evtchn_port_t evtchn; + + if (xen_fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) { + xen_evtchn_2l_init(); + xen_fifo_events = false; + } + + xen_cpu_init_eoi(smp_processor_id()); + + cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, + "xen/evtchn:prepare", + xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); + + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); + + /* No event channels are 'live' right now. */ + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); + + pirq_needs_eoi = pirq_needs_eoi_flag; + +#ifdef CONFIG_X86 + if (xen_pv_domain()) { + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + xen_init_setup_upcall_vector(); + xen_alloc_callback_vector(); + + + if (xen_hvm_domain()) { + native_init_IRQ(); + /* pci_xen_hvm_init must be called after native_init_IRQ so that + * __acpi_register_gsi can point at the right function */ + pci_xen_hvm_init(); + } else { + int rc; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map); + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + if (rc != 0) { + free_page((unsigned long) pirq_eoi_map); + pirq_eoi_map = NULL; + } else + pirq_needs_eoi = pirq_check_eoi_map; + } +#endif +} diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 000000000000..655775db7caf --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,440 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/barrier.h> +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <xen/page.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (XEN_PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned. + */ +#if BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(evtchn_port_t port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static int init_control_block(int cpu, + struct evtchn_fifo_control_block *control_block) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + struct evtchn_init_control init_control; + unsigned int i; + + /* Reset the control block and the local HEADs. 
*/ + clear_page(control_block); + for (i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++) + q->head[i] = 0; + + init_control.control_gfn = virt_to_gfn(control_block); + init_control.offset = 0; + init_control.vcpu = xen_vcpu_nr(cpu); + + return HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(evtchn_port_t port) +{ + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_gfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, + unsigned int old_cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_mask(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED if not PENDING, spinning if BUSY is set. + * Return true if mask was cleared. 
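+ * If PENDING is set the mask is left in place and false is returned;
+ * the caller then falls back to the EVTCHNOP_unmask hypercall so the
+ * hypervisor can deliver the pending event.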
+ */ +static bool clear_masked_cond(volatile event_word_t *word) +{ + event_word_t new, old; + + old = *word; + + do { + if (!(old & (1 << EVTCHN_FIFO_MASKED))) + return true; + + if (old & (1 << EVTCHN_FIFO_PENDING)) + return false; + + old = old & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + } while (!sync_try_cmpxchg(word, &old, new)); + + return true; +} + +static void evtchn_fifo_unmask(evtchn_port_t port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + if (!clear_masked_cond(word)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old; + + old = *word; + + do { + new = (old & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while (!sync_try_cmpxchg(word, &old, new)); + + return old & EVTCHN_FIFO_LINK_MASK; +} + +static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + evtchn_port_t port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + virt_rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. + */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) { + if (unlikely(!ctrl)) + pr_warn("Dropping pending event for port %u\n", port); + else + handle_irq_for_port(port, ctrl); + } + + q->head[priority] = head; +} + +static void __evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, ctrl, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) +{ + __evtchn_fifo_handle_events(cpu, ctrl); +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + ret = init_control_block(cpu, control_block); + BUG_ON(ret < 0); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. 
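+	 * Only the page count is reset here; pages already allocated in
+	 * event_array[] are kept and re-registered with the hypervisor by
+	 * evtchn_fifo_setup() when a port needs them again.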
+ */ + event_array_pages = 0; +} + +static int evtchn_fifo_alloc_control_block(unsigned cpu) +{ + void *control_block = NULL; + int ret = -ENOMEM; + + control_block = (void *)__get_free_page(GFP_KERNEL); + if (control_block == NULL) + goto error; + + ret = init_control_block(cpu, control_block); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = control_block; + + return 0; + + error: + free_page((unsigned long)control_block); + return ret; +} + +static int evtchn_fifo_percpu_init(unsigned int cpu) +{ + if (!per_cpu(cpu_control_block, cpu)) + return evtchn_fifo_alloc_control_block(cpu); + return 0; +} + +static int evtchn_fifo_percpu_deinit(unsigned int cpu) +{ + __evtchn_fifo_handle_events(cpu, NULL); + return 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, + .percpu_init = evtchn_fifo_percpu_init, + .percpu_deinit = evtchn_fifo_percpu_deinit, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = smp_processor_id(); + int ret; + + ret = evtchn_fifo_alloc_control_block(cpu); + if (ret < 0) + return ret; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 000000000000..19ae31695edc --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +struct evtchn_loop_ctrl; + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(evtchn_port_t port); + void (*remove)(evtchn_port_t port, unsigned int cpu); + void (*bind_to_cpu)(evtchn_port_t evtchn, unsigned int cpu, + unsigned int old_cpu); + + void (*clear_pending)(evtchn_port_t port); + void (*set_pending)(evtchn_port_t port); + bool (*is_pending)(evtchn_port_t port); + void (*mask)(evtchn_port_t port); + void (*unmask)(evtchn_port_t port); + + void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl); + void (*resume)(void); + + int (*percpu_init)(unsigned int cpu); + int (*percpu_deinit)(unsigned int cpu); +}; + +extern const struct evtchn_ops *evtchn_ops; + +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl); + +unsigned int cpu_from_evtchn(evtchn_port_t evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. 
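+ * For the FIFO ABI this expands the event array to cover the port;
+ * ABIs that provide no setup hook need nothing done here.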
+ */ +static inline int xen_evtchn_port_setup(evtchn_port_t evtchn) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(evtchn); + return 0; +} + +static inline void xen_evtchn_port_remove(evtchn_port_t evtchn, + unsigned int cpu) +{ + if (evtchn_ops->remove) + evtchn_ops->remove(evtchn, cpu); +} + +static inline void xen_evtchn_port_bind_to_cpu(evtchn_port_t evtchn, + unsigned int cpu, + unsigned int old_cpu) +{ + evtchn_ops->bind_to_cpu(evtchn, cpu, old_cpu); +} + +static inline void clear_evtchn(evtchn_port_t port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(evtchn_port_t port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(evtchn_port_t port) +{ + return evtchn_ops->is_pending(port); +} + +static inline void mask_evtchn(evtchn_port_t port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(evtchn_port_t port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) +{ + return evtchn_ops->handle_events(cpu, ctrl); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 8feecf01d55c..7e4a13e632dc 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -49,78 +49,142 @@ #include <linux/init.h> #include <linux/mutex.h> #include <linux/cpu.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> #include <xen/xen.h> #include <xen/events.h> #include <xen/evtchn.h> +#include <xen/xen-ops.h> #include <asm/xen/hypervisor.h> struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; + unsigned int nr_evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ -#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) -#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + unsigned int ring_size; evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ /* Processes wait on this queue when ring is empty. */ wait_queue_head_t evtchn_wait; struct fasync_struct *evtchn_async_queue; const char *name; + + domid_t restrict_domid; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. 
- */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +#define UNRESTRICTED_DOMID ((domid_t)-1) + +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + evtchn_port_t port; + bool enabled; + bool unbinding; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static void evtchn_free_ring(evtchn_port_t *ring) { - return (struct per_user_data *)(port_user[port] & ~1); + kvfree(ring); } -static inline void set_port_user(unsigned port, struct per_user_data *u) +static unsigned int evtchn_ring_offset(struct per_user_data *u, + unsigned int idx) { - port_user[port] = (unsigned long)u; + return idx & (u->ring_size - 1); } -static inline bool get_port_enabled(unsigned port) +static evtchn_port_t *evtchn_ring_entry(struct per_user_data *u, + unsigned int idx) { - return port_user[port] & 1; + return u->ring + evtchn_ring_offset(u, idx); } -static inline void set_port_enabled(unsigned port, bool enabled) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; + + u->nr_evtchns++; + + while (*new) { + struct user_evtchn *this; + + this = rb_entry(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; +} + +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) +{ + u->nr_evtchns--; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); +} + +static struct user_evtchn *find_evtchn(struct per_user_data *u, + evtchn_port_t port) +{ + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = rb_entry(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; + unsigned int prod, cons; + + /* Handler might be called when tearing down the IRQ. 
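+	 * evtchn_unbind_from_user() sets ->unbinding before calling
+	 * unbind_from_irqhandler(), so a late event is acknowledged and
+	 * dropped here instead of touching the ring.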
*/ + if (evtchn->unbinding) + return IRQ_HANDLED; - spin_lock(&port_user_lock); + WARN(!evtchn->enabled, + "Interrupt for port %u, but apparently not enabled; per-user %p\n", + evtchn->port, u); - u = get_port_user(port); + evtchn->enabled = false; - WARN(!get_port_enabled(port), - "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + spin_lock(&u->ring_prod_lock); - disable_irq_nosync(irq); - set_port_enabled(port, false); + prod = READ_ONCE(u->ring_prod); + cons = READ_ONCE(u->ring_cons); - if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; - wmb(); /* Ensure ring contents visible */ - if (u->ring_cons == u->ring_prod++) { + if ((prod - cons) < u->ring_size) { + *evtchn_ring_entry(u, prod) = evtchn->port; + smp_wmb(); /* Ensure ring contents visible */ + WRITE_ONCE(u->ring_prod, prod + 1); + if (cons == prod) { wake_up_interruptible(&u->evtchn_wait); kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); @@ -128,7 +192,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -156,8 +220,8 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, if (u->ring_overflow) goto unlock_out; - c = u->ring_cons; - p = u->ring_prod; + c = READ_ONCE(u->ring_cons); + p = READ_ONCE(u->ring_prod); if (c != p) break; @@ -167,16 +231,16 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, return -EAGAIN; rc = wait_event_interruptible(u->evtchn_wait, - u->ring_cons != u->ring_prod); + READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod)); if (rc) return rc; } /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ - if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { - bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + if (((c ^ p) & u->ring_size) != 0) { + bytes1 = (u->ring_size - evtchn_ring_offset(u, c)) * sizeof(evtchn_port_t); - bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + bytes2 = evtchn_ring_offset(u, p) * sizeof(evtchn_port_t); } else { bytes1 = (p - c) * sizeof(evtchn_port_t); bytes2 = 0; @@ -191,13 +255,13 @@ static ssize_t evtchn_read(struct file *file, char __user *buf, } rc = -EFAULT; - rmb(); /* Ensure that we see the port before we copy it. */ - if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + smp_rmb(); /* Ensure that we see the port before we copy it. 
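+	 * Pairs with the smp_wmb() in evtchn_interrupt(), which orders
+	 * the ring entry write before the producer index update.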
*/ + if (copy_to_user(buf, evtchn_ring_entry(u, c), bytes1) || ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) goto unlock_out; - u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + WRITE_ONCE(u->ring_cons, c + (bytes1 + bytes2) / sizeof(evtchn_port_t)); rc = bytes1 + bytes2; unlock_out: @@ -229,20 +293,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { - unsigned port = kbuf[i]; + evtchn_port_t port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); - enable_irq(irq_from_evtchn(port)); + evtchn = find_evtchn(u, port); + if (evtchn && !evtchn->enabled) { + evtchn->enabled = true; + xen_irq_lateeoi(irq_from_evtchn(port), 0); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -251,8 +315,66 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, return rc; } -static int evtchn_bind_to_user(struct per_user_data *u, int port) +static int evtchn_resize_ring(struct per_user_data *u) { + unsigned int new_size; + evtchn_port_t *new_ring, *old_ring; + + /* + * Ensure the ring is large enough to capture all possible + * events. i.e., one free slot for each bound event. + */ + if (u->nr_evtchns <= u->ring_size) + return 0; + + if (u->ring_size == 0) + new_size = 64; + else + new_size = 2 * u->ring_size; + + new_ring = kvmalloc_array(new_size, sizeof(*new_ring), GFP_KERNEL); + if (!new_ring) + return -ENOMEM; + + old_ring = u->ring; + + /* + * Access to the ring contents is serialized by either the + * prod /or/ cons lock so take both when resizing. + */ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&u->ring_prod_lock); + + /* + * Copy the old ring contents to the new ring. + * + * To take care of wrapping, a full ring, and the new index + * pointing into the second half, simply copy the old contents + * twice. + * + * +---------+ +------------------+ + * |34567 12| -> |34567 1234567 12| + * +-----p-c-+ +-------c------p---+ + */ + memcpy(new_ring, old_ring, u->ring_size * sizeof(*u->ring)); + memcpy(new_ring + u->ring_size, old_ring, + u->ring_size * sizeof(*u->ring)); + + u->ring = new_ring; + u->ring_size = new_size; + + spin_unlock_irq(&u->ring_prod_lock); + mutex_unlock(&u->ring_cons_mutex); + + evtchn_free_ring(old_ring); + + return 0; +} + +static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port, + bool is_static) +{ + struct user_evtchn *evtchn; int rc = 0; /* @@ -263,35 +385,51 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) 
*/ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ - - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = evtchn_make_refcounted(port); - else { - /* bind failed, should close the port now */ - struct evtchn_close close; - close.port = port; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - set_port_user(port, NULL); - } + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_resize_ring(u); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, IRQF_SHARED, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port, is_static); + return rc; + +err: + /* bind failed, should close the port now */ + if (!is_static) + xen_evtchn_close(port); + + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = irq_from_evtchn(evtchn->port); BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + evtchn->unbinding = true; + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -309,18 +447,22 @@ static long evtchn_ioctl(struct file *file, struct ioctl_evtchn_bind_virq bind; struct evtchn_bind_virq bind_virq; + rc = -EACCES; + if (u->restrict_domid != UNRESTRICTED_DOMID) + break; + rc = -EFAULT; if (copy_from_user(&bind, uarg, sizeof(bind))) break; bind_virq.virq = bind.virq; - bind_virq.vcpu = 0; + bind_virq.vcpu = xen_vcpu_nr(0); rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (rc != 0) break; - rc = evtchn_bind_to_user(u, bind_virq.port); + rc = evtchn_bind_to_user(u, bind_virq.port, false); if (rc == 0) rc = bind_virq.port; break; @@ -334,6 +476,11 @@ static long evtchn_ioctl(struct file *file, if (copy_from_user(&bind, uarg, sizeof(bind))) break; + rc = -EACCES; + if (u->restrict_domid != UNRESTRICTED_DOMID && + u->restrict_domid != bind.remote_domain) + break; + bind_interdomain.remote_dom = bind.remote_domain; bind_interdomain.remote_port = bind.remote_port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, @@ -341,7 +488,7 @@ static long evtchn_ioctl(struct file *file, if (rc != 0) break; - rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + rc = evtchn_bind_to_user(u, bind_interdomain.local_port, false); if (rc == 0) rc = bind_interdomain.local_port; break; @@ -351,6 +498,10 @@ static long evtchn_ioctl(struct file *file, struct ioctl_evtchn_bind_unbound_port bind; struct evtchn_alloc_unbound alloc_unbound; + rc = -EACCES; + if (u->restrict_domid != UNRESTRICTED_DOMID) + break; + rc = -EFAULT; if (copy_from_user(&bind, uarg, sizeof(bind))) break; @@ -362,7 +513,7 @@ static long evtchn_ioctl(struct file *file, if (rc != 0) break; - rc = evtchn_bind_to_user(u, alloc_unbound.port); + rc = evtchn_bind_to_user(u, alloc_unbound.port, false); if (rc == 0) rc = alloc_unbound.port; break; @@ -370,45 +521,55 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = 
-EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; - spin_lock_irq(&port_user_lock); - rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) { - spin_unlock_irq(&port_user_lock); + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; - } disable_irq(irq_from_evtchn(unbind.port)); + evtchn_unbind_from_user(u, evtchn); + rc = 0; + break; + } - spin_unlock_irq(&port_user_lock); + case IOCTL_EVTCHN_BIND_STATIC: { + struct ioctl_evtchn_bind bind; + struct user_evtchn *evtchn; - evtchn_unbind_from_user(u, unbind.port); + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; - rc = 0; + rc = -EISCONN; + evtchn = find_evtchn(u, bind.port); + if (evtchn) + break; + + rc = evtchn_bind_to_user(u, bind.port, true); break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -418,14 +579,37 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. */ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); - u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); + WRITE_ONCE(u->ring_cons, 0); + WRITE_ONCE(u->ring_prod, 0); + u->ring_overflow = 0; + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; } + case IOCTL_EVTCHN_RESTRICT_DOMID: { + struct ioctl_evtchn_restrict_domid ierd; + + rc = -EACCES; + if (u->restrict_domid != UNRESTRICTED_DOMID) + break; + + rc = -EFAULT; + if (copy_from_user(&ierd, uarg, sizeof(ierd))) + break; + + rc = -EINVAL; + if (ierd.domid == 0 || ierd.domid >= DOMID_FIRST_RESERVED) + break; + + u->restrict_domid = ierd.domid; + rc = 0; + + break; + } + default: rc = -ENOSYS; break; @@ -435,16 +619,16 @@ static long evtchn_ioctl(struct file *file, return rc; } -static unsigned int evtchn_poll(struct file *file, poll_table *wait) +static __poll_t evtchn_poll(struct file *file, poll_table *wait) { - unsigned int mask = POLLOUT | POLLWRNORM; + __poll_t mask = EPOLLOUT | EPOLLWRNORM; struct per_user_data *u = file->private_data; poll_wait(file, &u->evtchn_wait, wait); - if (u->ring_cons != u->ring_prod) - mask |= POLLIN | POLLRDNORM; + if (READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod)) + mask |= EPOLLIN | EPOLLRDNORM; if (u->ring_overflow) - mask = POLLERR; + mask = EPOLLERR; return mask; } @@ -470,46 +654,31 @@ static int evtchn_open(struct inode *inode, struct file *filp) init_waitqueue_head(&u->evtchn_wait); - u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); - if (u->ring == NULL) { - kfree(u->name); - kfree(u); - return -ENOMEM; - } - mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); + + u->restrict_domid = UNRESTRICTED_DOMID; filp->private_data = u; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - spin_lock_irq(&port_user_lock); - - free_page((unsigned 
long)u->ring); + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - disable_irq(irq_from_evtchn(i)); - } - - spin_unlock_irq(&port_user_lock); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } + evtchn_free_ring(u->ring); kfree(u->name); kfree(u); @@ -525,7 +694,6 @@ static const struct file_operations evtchn_fops = { .fasync = evtchn_fasync, .open = evtchn_open, .release = evtchn_release, - .llseek = no_llseek, }; static struct miscdevice evtchn_miscdev = { @@ -540,12 +708,6 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - /* Create '/dev/xen/evtchn'. */ err = misc_register(&evtchn_miscdev); if (err != 0) { @@ -560,13 +722,11 @@ static int __init evtchn_init(void) static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } module_init(evtchn_init); module_exit(evtchn_cleanup); +MODULE_DESCRIPTION("Xen /dev/xen/evtchn device driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c deleted file mode 100644 index b04fb64c5a91..000000000000 --- a/drivers/xen/fallback.c +++ /dev/null @@ -1,81 +0,0 @@ -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/bug.h> -#include <linux/export.h> -#include <asm/hypervisor.h> -#include <asm/xen/hypercall.h> - -int xen_event_channel_op_compat(int cmd, void *arg) -{ - struct evtchn_op op; - int rc; - - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - - switch (cmd) { - case EVTCHNOP_close: - case EVTCHNOP_send: - case EVTCHNOP_bind_vcpu: - case EVTCHNOP_unmask: - /* no output */ - break; - -#define COPY_BACK(eop) \ - case EVTCHNOP_##eop: \ - memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ - break - - COPY_BACK(bind_interdomain); - COPY_BACK(bind_virq); - COPY_BACK(bind_pirq); - COPY_BACK(status); - COPY_BACK(alloc_unbound); - COPY_BACK(bind_ipi); -#undef COPY_BACK - - default: - WARN_ON(rc != -ENOSYS); - break; - } - - return rc; -} -EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); - -int xen_physdev_op_compat(int cmd, void *arg) -{ - struct physdev_op op; - int rc; - - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - - switch (cmd) { - case PHYSDEVOP_IRQ_UNMASK_NOTIFY: - case PHYSDEVOP_set_iopl: - case PHYSDEVOP_set_iobitmap: - case PHYSDEVOP_apic_write: - /* no output */ - break; - -#define COPY_BACK(pop, fld) \ - case PHYSDEVOP_##pop: \ - memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ - break - - COPY_BACK(irq_status_query, irq_status_query); - COPY_BACK(apic_read, apic_op); - COPY_BACK(ASSIGN_VECTOR, irq_op); -#undef COPY_BACK - - default: - WARN_ON(rc != -ENOSYS); - break; - } - - return rc; -} -EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/features.c b/drivers/xen/features.c index 99eda169c779..87f1828d40d5 100644 --- a/drivers/xen/features.c +++ b/drivers/xen/features.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * 
features.c * @@ -7,14 +8,27 @@ */ #include <linux/types.h> #include <linux/cache.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/printk.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> #include <xen/features.h> +/* + * Linux kernel expects at least Xen 4.0. + * + * Assume some features to be available for that reason (depending on guest + * mode, of course). + */ +#define chk_required_feature(f) { \ + if (!xen_feature(f)) \ + panic("Xen: feature %s not available!\n", #f); \ + } + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; EXPORT_SYMBOL_GPL(xen_features); @@ -28,6 +42,11 @@ void xen_setup_features(void) if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) break; for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<<j); + xen_features[i * 32 + j] = !!(fi.submap & 1U << j); + } + + if (xen_pv_domain()) { + chk_required_feature(XENFEAT_mmu_pt_update_preserve_ad); + chk_required_feature(XENFEAT_gnttab_map_avail_bits); } } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 787d17945418..f93f73ecefee 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -124,30 +124,32 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, int i, rc, readonly; LIST_HEAD(queue_gref); LIST_HEAD(queue_file); - struct gntalloc_gref *gref; + struct gntalloc_gref *gref, *next; readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); - rc = -ENOMEM; for (i = 0; i < op->count; i++) { gref = kzalloc(sizeof(*gref), GFP_KERNEL); - if (!gref) + if (!gref) { + rc = -ENOMEM; goto undo; + } list_add_tail(&gref->next_gref, &queue_gref); list_add_tail(&gref->next_file, &queue_file); gref->users = 1; gref->file_index = op->index + i * PAGE_SIZE; gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!gref->page) + if (!gref->page) { + rc = -ENOMEM; goto undo; + } /* Grant foreign access to the page. */ - gref->gref_id = gnttab_grant_foreign_access(op->domid, - pfn_to_mfn(page_to_pfn(gref->page)), readonly); - if ((int)gref->gref_id < 0) { - rc = gref->gref_id; + rc = gnttab_grant_foreign_access(op->domid, + xen_page_to_gfn(gref->page), + readonly); + if (rc < 0) goto undo; - } - gref_ids[i] = gref->gref_id; + gref_ids[i] = gref->gref_id = rc; } /* Add to gref lists. */ @@ -162,19 +164,11 @@ undo: mutex_lock(&gref_mutex); gref_size -= (op->count - i); - list_for_each_entry(gref, &queue_file, next_file) { - /* __del_gref does not remove from queue_file */ + list_for_each_entry_safe(gref, next, &queue_file, next_file) { + list_del(&gref->next_file); __del_gref(gref); } - /* It's possible for the target domain to map the just-allocated grant - * references by blindly guessing their IDs; if this is done, then - * __del_gref will leave them in the queue_gref list. They need to be - * added to the global list so that we can free them when they are no - * longer referenced. 
- */ - if (unlikely(!list_empty(&queue_gref))) - list_splice_tail(&queue_gref, &gref_list); mutex_unlock(&gref_mutex); return rc; } @@ -182,9 +176,9 @@ undo: static void __del_gref(struct gntalloc_gref *gref) { if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { - uint8_t *tmp = kmap(gref->page); + uint8_t *tmp = kmap_local_page(gref->page); tmp[gref->notify.pgoff] = 0; - kunmap(gref->page); + kunmap_local(tmp); } if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(gref->notify.event); @@ -193,22 +187,16 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; - if (gref->gref_id > 0) { - if (gnttab_query_foreign_access(gref->gref_id)) - return; - - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) - return; - - gnttab_free_grant_reference(gref->gref_id); + if (gref->gref_id) { + if (gref->page) + gnttab_end_foreign_access(gref->gref_id, gref->page); + else + gnttab_free_grant_reference(gref->gref_id); } gref_size--; list_del(&gref->next_gref); - if (gref->page) - __free_page(gref->page); - kfree(gref); } @@ -292,7 +280,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, goto out; } - gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); + gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_KERNEL); if (!gref_ids) { rc = -ENOMEM; goto out; @@ -329,7 +317,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, rc = -EFAULT; goto out_free; } - if (copy_to_user(arg->gref_ids, gref_ids, + if (copy_to_user(arg->gref_ids_flex, gref_ids, sizeof(gref_ids[0]) * op.count)) { rc = -EFAULT; goto out_free; @@ -495,7 +483,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma) mutex_unlock(&gref_mutex); } -static struct vm_operations_struct gntalloc_vmops = { +static const struct vm_operations_struct gntalloc_vmops = { .open = gntalloc_vma_open, .close = gntalloc_vma_close, }; @@ -505,7 +493,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) struct gntalloc_file_private_data *priv = filp->private_data; struct gntalloc_vma_private_data *vm_priv; struct gntalloc_gref *gref; - int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int count = vma_pages(vma); int rv, i; if (!(vma->vm_flags & VM_SHARED)) { @@ -537,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h new file mode 100644 index 000000000000..ac8ce3179ba2 --- /dev/null +++ b/drivers/xen/gntdev-common.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Common functionality of grant device. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc. + */ + +#ifndef _GNTDEV_COMMON_H +#define _GNTDEV_COMMON_H + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_notifier.h> +#include <linux/types.h> +#include <xen/interface/event_channel.h> +#include <xen/grant_table.h> + +struct gntdev_dmabuf_priv; + +struct gntdev_priv { + /* Maps with visible offsets in the file descriptor. */ + struct list_head maps; + /* lock protects maps and freeable_maps. */ + struct mutex lock; + + /* Free instances of struct gntdev_copy_batch. 
*/ + struct gntdev_copy_batch *batch; + struct mutex batch_lock; + +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + /* Device for which DMA memory is allocated. */ + struct device *dma_dev; +#endif + +#ifdef CONFIG_XEN_GNTDEV_DMABUF + struct gntdev_dmabuf_priv *dmabuf_priv; +#endif +}; + +struct gntdev_unmap_notify { + int flags; + /* Address relative to the start of the gntdev_grant_map. */ + int addr; + evtchn_port_t event; +}; + +struct gntdev_grant_map { + atomic_t in_use; + struct mmu_interval_notifier notifier; + bool notifier_init; + struct list_head next; + int index; + int count; + int flags; + refcount_t users; + struct gntdev_unmap_notify notify; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; + struct gnttab_unmap_grant_ref *kunmap_ops; + bool *being_removed; + struct page **pages; + unsigned long pages_vm_start; + +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + /* + * If dmabuf_vaddr is not NULL then this mapping is backed by DMA + * capable memory. + */ + + struct device *dma_dev; + /* Flags used to create this DMA buffer: GNTDEV_DMA_FLAG_XXX. */ + int dma_flags; + void *dma_vaddr; + dma_addr_t dma_bus_addr; + /* Needed to avoid allocation in gnttab_dma_free_pages(). */ + xen_pfn_t *frames; +#endif + + /* Number of live grants */ + atomic_t live_grants; + /* Needed to avoid allocation in __unmap_grant_pages */ + struct gntab_unmap_queue_data unmap_data; +}; + +struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, + int dma_flags); + +void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add); + +void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map); + +bool gntdev_test_page_count(unsigned int count); + +int gntdev_map_grant_pages(struct gntdev_grant_map *map); + +#endif diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c new file mode 100644 index 000000000000..550980dd3b0b --- /dev/null +++ b/drivers/xen/gntdev-dmabuf.c @@ -0,0 +1,839 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Xen dma-buf functionality for gntdev. + * + * DMA buffer implementation is based on drivers/gpu/drm/drm_prime.c. + * + * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc. + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/dma-buf.h> +#include <linux/dma-direct.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/module.h> + +#include <xen/xen.h> +#include <xen/grant_table.h> + +#include "gntdev-common.h" +#include "gntdev-dmabuf.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct gntdev_dmabuf { + struct gntdev_dmabuf_priv *priv; + struct dma_buf *dmabuf; + struct list_head next; + int fd; + + union { + struct { + /* Exported buffers are reference counted. */ + struct kref refcount; + + struct gntdev_priv *priv; + struct gntdev_grant_map *map; + } exp; + struct { + /* Granted references of the imported buffer. */ + grant_ref_t *refs; + /* Scatter-gather table of the imported buffer. */ + struct sg_table *sgt; + /* dma-buf attachment of the imported buffer. */ + struct dma_buf_attachment *attach; + } imp; + } u; + + /* Number of pages this buffer has. */ + int nr_pages; + /* Pages of this buffer (only for dma-buf export). 
*/ + struct page **pages; +}; + +struct gntdev_dmabuf_wait_obj { + struct list_head next; + struct gntdev_dmabuf *gntdev_dmabuf; + struct completion completion; +}; + +struct gntdev_dmabuf_attachment { + struct sg_table *sgt; + enum dma_data_direction dir; +}; + +struct gntdev_dmabuf_priv { + /* List of exported DMA buffers. */ + struct list_head exp_list; + /* List of wait objects. */ + struct list_head exp_wait_list; + /* List of imported DMA buffers. */ + struct list_head imp_list; + /* This is the lock which protects dma_buf_xxx lists. */ + struct mutex lock; + /* + * We reference this file while exporting dma-bufs, so + * the grant device context is not destroyed while there are + * external users alive. + */ + struct file *filp; +}; + +/* DMA buffer export support. */ + +/* Implementation of wait for exported DMA buffer to be released. */ + +static void dmabuf_exp_release(struct kref *kref); + +static struct gntdev_dmabuf_wait_obj * +dmabuf_exp_wait_obj_new(struct gntdev_dmabuf_priv *priv, + struct gntdev_dmabuf *gntdev_dmabuf) +{ + struct gntdev_dmabuf_wait_obj *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return ERR_PTR(-ENOMEM); + + init_completion(&obj->completion); + obj->gntdev_dmabuf = gntdev_dmabuf; + + mutex_lock(&priv->lock); + list_add(&obj->next, &priv->exp_wait_list); + /* Put our reference and wait for gntdev_dmabuf's release to fire. */ + kref_put(&gntdev_dmabuf->u.exp.refcount, dmabuf_exp_release); + mutex_unlock(&priv->lock); + return obj; +} + +static void dmabuf_exp_wait_obj_free(struct gntdev_dmabuf_priv *priv, + struct gntdev_dmabuf_wait_obj *obj) +{ + mutex_lock(&priv->lock); + list_del(&obj->next); + mutex_unlock(&priv->lock); + kfree(obj); +} + +static int dmabuf_exp_wait_obj_wait(struct gntdev_dmabuf_wait_obj *obj, + u32 wait_to_ms) +{ + if (wait_for_completion_timeout(&obj->completion, + msecs_to_jiffies(wait_to_ms)) <= 0) + return -ETIMEDOUT; + + return 0; +} + +static void dmabuf_exp_wait_obj_signal(struct gntdev_dmabuf_priv *priv, + struct gntdev_dmabuf *gntdev_dmabuf) +{ + struct gntdev_dmabuf_wait_obj *obj; + + list_for_each_entry(obj, &priv->exp_wait_list, next) + if (obj->gntdev_dmabuf == gntdev_dmabuf) { + pr_debug("Found gntdev_dmabuf in the wait list, wake\n"); + complete_all(&obj->completion); + break; + } +} + +static struct gntdev_dmabuf * +dmabuf_exp_wait_obj_get_dmabuf(struct gntdev_dmabuf_priv *priv, int fd) +{ + struct gntdev_dmabuf *gntdev_dmabuf, *ret = ERR_PTR(-ENOENT); + + mutex_lock(&priv->lock); + list_for_each_entry(gntdev_dmabuf, &priv->exp_list, next) + if (gntdev_dmabuf->fd == fd) { + pr_debug("Found gntdev_dmabuf in the wait list\n"); + kref_get(&gntdev_dmabuf->u.exp.refcount); + ret = gntdev_dmabuf; + break; + } + mutex_unlock(&priv->lock); + return ret; +} + +static int dmabuf_exp_wait_released(struct gntdev_dmabuf_priv *priv, int fd, + int wait_to_ms) +{ + struct gntdev_dmabuf *gntdev_dmabuf; + struct gntdev_dmabuf_wait_obj *obj; + int ret; + + pr_debug("Will wait for dma-buf with fd %d\n", fd); + /* + * Try to find the DMA buffer: if not found means that + * either the buffer has already been released or file descriptor + * provided is wrong. + */ + gntdev_dmabuf = dmabuf_exp_wait_obj_get_dmabuf(priv, fd); + if (IS_ERR(gntdev_dmabuf)) + return PTR_ERR(gntdev_dmabuf); + + /* + * gntdev_dmabuf still exists and is reference count locked by us now, + * so prepare to wait: allocate wait object and add it to the wait list, + * so we can find it on release. 
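+	 * dmabuf_exp_wait_obj_new() drops the reference taken above once the
+	 * object is queued, and dmabuf_exp_release() completes the wait
+	 * object when the buffer's last reference goes away.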
+ */ + obj = dmabuf_exp_wait_obj_new(priv, gntdev_dmabuf); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + ret = dmabuf_exp_wait_obj_wait(obj, wait_to_ms); + dmabuf_exp_wait_obj_free(priv, obj); + return ret; +} + +/* DMA buffer export support. */ + +static struct sg_table * +dmabuf_pages_to_sgt(struct page **pages, unsigned int nr_pages) +{ + struct sg_table *sgt; + int ret; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) { + ret = -ENOMEM; + goto out; + } + + ret = sg_alloc_table_from_pages(sgt, pages, nr_pages, 0, + nr_pages << PAGE_SHIFT, + GFP_KERNEL); + if (ret) + goto out; + + return sgt; + +out: + kfree(sgt); + return ERR_PTR(ret); +} + +static int dmabuf_exp_ops_attach(struct dma_buf *dma_buf, + struct dma_buf_attachment *attach) +{ + struct gntdev_dmabuf_attachment *gntdev_dmabuf_attach; + + gntdev_dmabuf_attach = kzalloc(sizeof(*gntdev_dmabuf_attach), + GFP_KERNEL); + if (!gntdev_dmabuf_attach) + return -ENOMEM; + + gntdev_dmabuf_attach->dir = DMA_NONE; + attach->priv = gntdev_dmabuf_attach; + return 0; +} + +static void dmabuf_exp_ops_detach(struct dma_buf *dma_buf, + struct dma_buf_attachment *attach) +{ + struct gntdev_dmabuf_attachment *gntdev_dmabuf_attach = attach->priv; + + if (gntdev_dmabuf_attach) { + struct sg_table *sgt = gntdev_dmabuf_attach->sgt; + + if (sgt) { + if (gntdev_dmabuf_attach->dir != DMA_NONE) + dma_unmap_sgtable(attach->dev, sgt, + gntdev_dmabuf_attach->dir, + DMA_ATTR_SKIP_CPU_SYNC); + sg_free_table(sgt); + } + + kfree(sgt); + kfree(gntdev_dmabuf_attach); + attach->priv = NULL; + } +} + +static struct sg_table * +dmabuf_exp_ops_map_dma_buf(struct dma_buf_attachment *attach, + enum dma_data_direction dir) +{ + struct gntdev_dmabuf_attachment *gntdev_dmabuf_attach = attach->priv; + struct gntdev_dmabuf *gntdev_dmabuf = attach->dmabuf->priv; + struct sg_table *sgt; + + pr_debug("Mapping %d pages for dev %p\n", gntdev_dmabuf->nr_pages, + attach->dev); + + if (dir == DMA_NONE || !gntdev_dmabuf_attach) + return ERR_PTR(-EINVAL); + + /* Return the cached mapping when possible. */ + if (gntdev_dmabuf_attach->dir == dir) + return gntdev_dmabuf_attach->sgt; + + /* + * Two mappings with different directions for the same attachment are + * not allowed. + */ + if (gntdev_dmabuf_attach->dir != DMA_NONE) + return ERR_PTR(-EBUSY); + + sgt = dmabuf_pages_to_sgt(gntdev_dmabuf->pages, + gntdev_dmabuf->nr_pages); + if (!IS_ERR(sgt)) { + if (dma_map_sgtable(attach->dev, sgt, dir, + DMA_ATTR_SKIP_CPU_SYNC)) { + sg_free_table(sgt); + kfree(sgt); + sgt = ERR_PTR(-ENOMEM); + } else { + gntdev_dmabuf_attach->sgt = sgt; + gntdev_dmabuf_attach->dir = dir; + } + } + if (IS_ERR(sgt)) + pr_debug("Failed to map sg table for dev %p\n", attach->dev); + return sgt; +} + +static void dmabuf_exp_ops_unmap_dma_buf(struct dma_buf_attachment *attach, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + /* Not implemented. The unmap is done at dmabuf_exp_ops_detach(). 
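+	 * The sg table handed out by dmabuf_exp_ops_map_dma_buf() is cached
+	 * in the attachment and reused, so it has to stay mapped until the
+	 * attachment itself is torn down.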
*/ +} + +static void dmabuf_exp_release(struct kref *kref) +{ + struct gntdev_dmabuf *gntdev_dmabuf = + container_of(kref, struct gntdev_dmabuf, u.exp.refcount); + + dmabuf_exp_wait_obj_signal(gntdev_dmabuf->priv, gntdev_dmabuf); + list_del(&gntdev_dmabuf->next); + fput(gntdev_dmabuf->priv->filp); + kfree(gntdev_dmabuf); +} + +static void dmabuf_exp_remove_map(struct gntdev_priv *priv, + struct gntdev_grant_map *map) +{ + mutex_lock(&priv->lock); + list_del(&map->next); + gntdev_put_map(NULL /* already removed */, map); + mutex_unlock(&priv->lock); +} + +static void dmabuf_exp_ops_release(struct dma_buf *dma_buf) +{ + struct gntdev_dmabuf *gntdev_dmabuf = dma_buf->priv; + struct gntdev_dmabuf_priv *priv = gntdev_dmabuf->priv; + + dmabuf_exp_remove_map(gntdev_dmabuf->u.exp.priv, + gntdev_dmabuf->u.exp.map); + mutex_lock(&priv->lock); + kref_put(&gntdev_dmabuf->u.exp.refcount, dmabuf_exp_release); + mutex_unlock(&priv->lock); +} + +static const struct dma_buf_ops dmabuf_exp_ops = { + .attach = dmabuf_exp_ops_attach, + .detach = dmabuf_exp_ops_detach, + .map_dma_buf = dmabuf_exp_ops_map_dma_buf, + .unmap_dma_buf = dmabuf_exp_ops_unmap_dma_buf, + .release = dmabuf_exp_ops_release, +}; + +struct gntdev_dmabuf_export_args { + struct gntdev_priv *priv; + struct gntdev_grant_map *map; + struct gntdev_dmabuf_priv *dmabuf_priv; + struct device *dev; + int count; + struct page **pages; + u32 fd; +}; + +static int dmabuf_exp_from_pages(struct gntdev_dmabuf_export_args *args) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct gntdev_dmabuf *gntdev_dmabuf __free(kfree) = NULL; + CLASS(get_unused_fd, ret)(O_CLOEXEC); + + if (ret < 0) + return ret; + + gntdev_dmabuf = kzalloc(sizeof(*gntdev_dmabuf), GFP_KERNEL); + if (!gntdev_dmabuf) + return -ENOMEM; + + kref_init(&gntdev_dmabuf->u.exp.refcount); + + gntdev_dmabuf->priv = args->dmabuf_priv; + gntdev_dmabuf->nr_pages = args->count; + gntdev_dmabuf->pages = args->pages; + gntdev_dmabuf->u.exp.priv = args->priv; + gntdev_dmabuf->u.exp.map = args->map; + + exp_info.exp_name = KBUILD_MODNAME; + if (args->dev->driver && args->dev->driver->owner) + exp_info.owner = args->dev->driver->owner; + else + exp_info.owner = THIS_MODULE; + exp_info.ops = &dmabuf_exp_ops; + exp_info.size = args->count << PAGE_SHIFT; + exp_info.flags = O_RDWR; + exp_info.priv = gntdev_dmabuf; + + gntdev_dmabuf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(gntdev_dmabuf->dmabuf)) + return PTR_ERR(gntdev_dmabuf->dmabuf); + + gntdev_dmabuf->fd = ret; + args->fd = ret; + + pr_debug("Exporting DMA buffer with fd %d\n", ret); + + get_file(gntdev_dmabuf->priv->filp); + mutex_lock(&args->dmabuf_priv->lock); + list_add(&gntdev_dmabuf->next, &args->dmabuf_priv->exp_list); + mutex_unlock(&args->dmabuf_priv->lock); + + fd_install(take_fd(ret), no_free_ptr(gntdev_dmabuf)->dmabuf->file); + return 0; +} + +static struct gntdev_grant_map * +dmabuf_exp_alloc_backing_storage(struct gntdev_priv *priv, int dmabuf_flags, + int count) +{ + struct gntdev_grant_map *map; + + if (unlikely(gntdev_test_page_count(count))) + return ERR_PTR(-EINVAL); + + if ((dmabuf_flags & GNTDEV_DMA_FLAG_WC) && + (dmabuf_flags & GNTDEV_DMA_FLAG_COHERENT)) { + pr_debug("Wrong dma-buf flags: 0x%x\n", dmabuf_flags); + return ERR_PTR(-EINVAL); + } + + map = gntdev_alloc_map(priv, count, dmabuf_flags); + if (!map) + return ERR_PTR(-ENOMEM); + + return map; +} + +static int dmabuf_exp_from_refs(struct gntdev_priv *priv, int flags, + int count, u32 domid, u32 *refs, u32 *fd) +{ + struct gntdev_grant_map *map; + struct 
gntdev_dmabuf_export_args args; + int i, ret; + + map = dmabuf_exp_alloc_backing_storage(priv, flags, count); + if (IS_ERR(map)) + return PTR_ERR(map); + + for (i = 0; i < count; i++) { + map->grants[i].domid = domid; + map->grants[i].ref = refs[i]; + } + + mutex_lock(&priv->lock); + gntdev_add_map(priv, map); + mutex_unlock(&priv->lock); + + map->flags |= GNTMAP_host_map; +#if defined(CONFIG_X86) + map->flags |= GNTMAP_device_map; +#endif + + ret = gntdev_map_grant_pages(map); + if (ret < 0) + goto out; + + args.priv = priv; + args.map = map; + args.dev = priv->dma_dev; + args.dmabuf_priv = priv->dmabuf_priv; + args.count = map->count; + args.pages = map->pages; + args.fd = -1; /* Shut up unnecessary gcc warning for i386 */ + + ret = dmabuf_exp_from_pages(&args); + if (ret < 0) + goto out; + + *fd = args.fd; + return 0; + +out: + dmabuf_exp_remove_map(priv, map); + return ret; +} + +/* DMA buffer import support. */ + +static int +dmabuf_imp_grant_foreign_access(unsigned long *gfns, u32 *refs, + int count, int domid) +{ + grant_ref_t priv_gref_head; + int i, ret; + + ret = gnttab_alloc_grant_references(count, &priv_gref_head); + if (ret < 0) { + pr_debug("Cannot allocate grant references, ret %d\n", ret); + return ret; + } + + for (i = 0; i < count; i++) { + int cur_ref; + + cur_ref = gnttab_claim_grant_reference(&priv_gref_head); + if (cur_ref < 0) { + ret = cur_ref; + pr_debug("Cannot claim grant reference, ret %d\n", ret); + goto out; + } + + gnttab_grant_foreign_access_ref(cur_ref, domid, + gfns[i], 0); + refs[i] = cur_ref; + } + + return 0; + +out: + gnttab_free_grant_references(priv_gref_head); + return ret; +} + +static void dmabuf_imp_end_foreign_access(u32 *refs, int count) +{ + int i; + + for (i = 0; i < count; i++) + if (refs[i] != INVALID_GRANT_REF) + gnttab_end_foreign_access(refs[i], NULL); +} + +static void dmabuf_imp_free_storage(struct gntdev_dmabuf *gntdev_dmabuf) +{ + kfree(gntdev_dmabuf->u.imp.refs); + kfree(gntdev_dmabuf); +} + +static struct gntdev_dmabuf *dmabuf_imp_alloc_storage(int count) +{ + struct gntdev_dmabuf *gntdev_dmabuf; + int i; + + gntdev_dmabuf = kzalloc(sizeof(*gntdev_dmabuf), GFP_KERNEL); + if (!gntdev_dmabuf) + goto fail_no_free; + + gntdev_dmabuf->u.imp.refs = kcalloc(count, + sizeof(gntdev_dmabuf->u.imp.refs[0]), + GFP_KERNEL); + if (!gntdev_dmabuf->u.imp.refs) + goto fail; + + gntdev_dmabuf->nr_pages = count; + + for (i = 0; i < count; i++) + gntdev_dmabuf->u.imp.refs[i] = INVALID_GRANT_REF; + + return gntdev_dmabuf; + +fail: + dmabuf_imp_free_storage(gntdev_dmabuf); +fail_no_free: + return ERR_PTR(-ENOMEM); +} + +static struct gntdev_dmabuf * +dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev, + int fd, int count, int domid) +{ + struct gntdev_dmabuf *gntdev_dmabuf, *ret; + struct dma_buf *dma_buf; + struct dma_buf_attachment *attach; + struct sg_table *sgt; + struct sg_dma_page_iter sg_iter; + unsigned long *gfns; + int i; + + dma_buf = dma_buf_get(fd); + if (IS_ERR(dma_buf)) + return ERR_CAST(dma_buf); + + gntdev_dmabuf = dmabuf_imp_alloc_storage(count); + if (IS_ERR(gntdev_dmabuf)) { + ret = gntdev_dmabuf; + goto fail_put; + } + + gntdev_dmabuf->priv = priv; + gntdev_dmabuf->fd = fd; + + attach = dma_buf_attach(dma_buf, dev); + if (IS_ERR(attach)) { + ret = ERR_CAST(attach); + goto fail_free_obj; + } + + gntdev_dmabuf->u.imp.attach = attach; + + sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) { + ret = ERR_CAST(sgt); + goto fail_detach; + } + + /* Check that we have zero offset. 
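+	 * The grant references created below are page-granular and user
+	 * space expects the data to start at offset zero, so a buffer whose
+	 * first scatterlist entry starts mid-page cannot be represented and
+	 * is rejected.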
*/ + if (sgt->sgl->offset) { + ret = ERR_PTR(-EINVAL); + pr_debug("DMA buffer has %d bytes offset, user-space expects 0\n", + sgt->sgl->offset); + goto fail_unmap; + } + + /* Check number of pages that imported buffer has. */ + if (attach->dmabuf->size != gntdev_dmabuf->nr_pages << PAGE_SHIFT) { + ret = ERR_PTR(-EINVAL); + pr_debug("DMA buffer has %zu pages, user-space expects %d\n", + attach->dmabuf->size, gntdev_dmabuf->nr_pages); + goto fail_unmap; + } + + gntdev_dmabuf->u.imp.sgt = sgt; + + gfns = kcalloc(count, sizeof(*gfns), GFP_KERNEL); + if (!gfns) { + ret = ERR_PTR(-ENOMEM); + goto fail_unmap; + } + + /* + * Now convert sgt to array of gfns without accessing underlying pages. + * It is not allowed to access the underlying struct page of an sg table + * exported by DMA-buf, but since we deal with special Xen dma device here + * (not a normal physical one) look at the dma addresses in the sg table + * and then calculate gfns directly from them. + */ + i = 0; + for_each_sgtable_dma_page(sgt, &sg_iter, 0) { + dma_addr_t addr = sg_page_iter_dma_address(&sg_iter); + unsigned long pfn = bfn_to_pfn(XEN_PFN_DOWN(dma_to_phys(dev, addr))); + + gfns[i++] = pfn_to_gfn(pfn); + } + + ret = ERR_PTR(dmabuf_imp_grant_foreign_access(gfns, + gntdev_dmabuf->u.imp.refs, + count, domid)); + kfree(gfns); + if (IS_ERR(ret)) + goto fail_end_access; + + pr_debug("Imported DMA buffer with fd %d\n", fd); + + mutex_lock(&priv->lock); + list_add(&gntdev_dmabuf->next, &priv->imp_list); + mutex_unlock(&priv->lock); + + return gntdev_dmabuf; + +fail_end_access: + dmabuf_imp_end_foreign_access(gntdev_dmabuf->u.imp.refs, count); +fail_unmap: + dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL); +fail_detach: + dma_buf_detach(dma_buf, attach); +fail_free_obj: + dmabuf_imp_free_storage(gntdev_dmabuf); +fail_put: + dma_buf_put(dma_buf); + return ret; +} + +/* + * Find the hyper dma-buf by its file descriptor and remove + * it from the buffer's list. + */ +static struct gntdev_dmabuf * +dmabuf_imp_find_unlink(struct gntdev_dmabuf_priv *priv, int fd) +{ + struct gntdev_dmabuf *q, *gntdev_dmabuf, *ret = ERR_PTR(-ENOENT); + + mutex_lock(&priv->lock); + list_for_each_entry_safe(gntdev_dmabuf, q, &priv->imp_list, next) { + if (gntdev_dmabuf->fd == fd) { + pr_debug("Found gntdev_dmabuf in the import list\n"); + ret = gntdev_dmabuf; + list_del(&gntdev_dmabuf->next); + break; + } + } + mutex_unlock(&priv->lock); + return ret; +} + +static int dmabuf_imp_release(struct gntdev_dmabuf_priv *priv, u32 fd) +{ + struct gntdev_dmabuf *gntdev_dmabuf; + struct dma_buf_attachment *attach; + struct dma_buf *dma_buf; + + gntdev_dmabuf = dmabuf_imp_find_unlink(priv, fd); + if (IS_ERR(gntdev_dmabuf)) + return PTR_ERR(gntdev_dmabuf); + + pr_debug("Releasing DMA buffer with fd %d\n", fd); + + dmabuf_imp_end_foreign_access(gntdev_dmabuf->u.imp.refs, + gntdev_dmabuf->nr_pages); + + attach = gntdev_dmabuf->u.imp.attach; + + if (gntdev_dmabuf->u.imp.sgt) + dma_buf_unmap_attachment_unlocked(attach, gntdev_dmabuf->u.imp.sgt, + DMA_BIDIRECTIONAL); + dma_buf = attach->dmabuf; + dma_buf_detach(attach->dmabuf, attach); + dma_buf_put(dma_buf); + + dmabuf_imp_free_storage(gntdev_dmabuf); + return 0; +} + +static void dmabuf_imp_release_all(struct gntdev_dmabuf_priv *priv) +{ + struct gntdev_dmabuf *q, *gntdev_dmabuf; + + list_for_each_entry_safe(gntdev_dmabuf, q, &priv->imp_list, next) + dmabuf_imp_release(priv, gntdev_dmabuf->fd); +} + +/* DMA buffer IOCTL support. 
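+ *
+ * As an informal illustration only (not part of this driver), user space
+ * would typically drive the export path below along these lines, using
+ * the uapi definitions from xen/gntdev.h (field layout assumed here;
+ * gntdev_fd, count, refs[] and remote_domid are the caller's own):
+ *
+ *	struct ioctl_gntdev_dmabuf_exp_from_refs *op;
+ *
+ *	op = calloc(1, sizeof(*op) + count * sizeof(op->refs[0]));
+ *	op->flags = GNTDEV_DMA_FLAG_WC;
+ *	op->count = count;
+ *	op->domid = remote_domid;
+ *	memcpy(op->refs, refs, count * sizeof(op->refs[0]));
+ *	if (ioctl(gntdev_fd, IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS, op) == 0)
+ *		exported_fd = op->fd;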
*/ + +long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_exp_from_refs __user *u) +{ + struct ioctl_gntdev_dmabuf_exp_from_refs op; + u32 *refs; + long ret; + + if (xen_pv_domain()) { + pr_debug("Cannot provide dma-buf in a PV domain\n"); + return -EINVAL; + } + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + + if (unlikely(gntdev_test_page_count(op.count))) + return -EINVAL; + + refs = kcalloc(op.count, sizeof(*refs), GFP_KERNEL); + if (!refs) + return -ENOMEM; + + if (copy_from_user(refs, u->refs, sizeof(*refs) * op.count) != 0) { + ret = -EFAULT; + goto out; + } + + ret = dmabuf_exp_from_refs(priv, op.flags, op.count, + op.domid, refs, &op.fd); + if (ret) + goto out; + + if (copy_to_user(u, &op, sizeof(op)) != 0) + ret = -EFAULT; + +out: + kfree(refs); + return ret; +} + +long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_exp_wait_released __user *u) +{ + struct ioctl_gntdev_dmabuf_exp_wait_released op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + + return dmabuf_exp_wait_released(priv->dmabuf_priv, op.fd, + op.wait_to_ms); +} + +long gntdev_ioctl_dmabuf_imp_to_refs(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_imp_to_refs __user *u) +{ + struct ioctl_gntdev_dmabuf_imp_to_refs op; + struct gntdev_dmabuf *gntdev_dmabuf; + long ret; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + + if (unlikely(gntdev_test_page_count(op.count))) + return -EINVAL; + + gntdev_dmabuf = dmabuf_imp_to_refs(priv->dmabuf_priv, + priv->dma_dev, op.fd, + op.count, op.domid); + if (IS_ERR(gntdev_dmabuf)) + return PTR_ERR(gntdev_dmabuf); + + if (copy_to_user(u->refs, gntdev_dmabuf->u.imp.refs, + sizeof(*u->refs) * op.count) != 0) { + ret = -EFAULT; + goto out_release; + } + return 0; + +out_release: + dmabuf_imp_release(priv->dmabuf_priv, op.fd); + return ret; +} + +long gntdev_ioctl_dmabuf_imp_release(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_imp_release __user *u) +{ + struct ioctl_gntdev_dmabuf_imp_release op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + + return dmabuf_imp_release(priv->dmabuf_priv, op.fd); +} + +struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp) +{ + struct gntdev_dmabuf_priv *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + + mutex_init(&priv->lock); + INIT_LIST_HEAD(&priv->exp_list); + INIT_LIST_HEAD(&priv->exp_wait_list); + INIT_LIST_HEAD(&priv->imp_list); + + priv->filp = filp; + + return priv; +} + +void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv) +{ + dmabuf_imp_release_all(priv); + kfree(priv); +} diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h new file mode 100644 index 000000000000..9adf96ac74d3 --- /dev/null +++ b/drivers/xen/gntdev-dmabuf.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Xen dma-buf functionality for gntdev. + * + * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc. 
+ */ + +#ifndef _GNTDEV_DMABUF_H +#define _GNTDEV_DMABUF_H + +#include <xen/gntdev.h> + +struct gntdev_dmabuf_priv; +struct gntdev_priv; + +struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp); + +void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv); + +long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_exp_from_refs __user *u); + +long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_exp_wait_released __user *u); + +long gntdev_ioctl_dmabuf_imp_to_refs(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_imp_to_refs __user *u); + +long gntdev_ioctl_dmabuf_imp_release(struct gntdev_priv *priv, + struct ioctl_gntdev_dmabuf_imp_release __user *u); + +#endif diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index eab5427c75f5..2c960f187f7c 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -6,6 +6,7 @@ * * Copyright (c) 2006-2007, D G Murray. * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -21,88 +22,74 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/miscdevice.h> #include <linux/fs.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/mmu_notifier.h> -#include <linux/types.h> #include <linux/uaccess.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/refcount.h> +#include <linux/workqueue.h> #include <xen/xen.h> #include <xen/grant_table.h> #include <xen/balloon.h> #include <xen/gntdev.h> #include <xen/events.h> +#include <xen/page.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> -#include <asm/xen/page.h> + +#include "gntdev-common.h" +#ifdef CONFIG_XEN_GNTDEV_DMABUF +#include "gntdev-dmabuf.h" +#endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " "Gerd Hoffmann <kraxel@redhat.com>"); MODULE_DESCRIPTION("User-space granted page access driver"); -static int limit = 1024*1024; -module_param(limit, int, 0644); -MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " - "the gntdev device"); - -static atomic_t pages_mapped = ATOMIC_INIT(0); - -static int use_ptemod; -#define populate_freeable_maps use_ptemod - -struct gntdev_priv { - /* maps with visible offsets in the file descriptor */ - struct list_head maps; - /* maps that are not visible; will be freed on munmap. 
- * Only populated if populate_freeable_maps == 1 */ - struct list_head freeable_maps; - /* lock protects maps and freeable_maps */ - spinlock_t lock; - struct mm_struct *mm; - struct mmu_notifier mn; -}; +#define GNTDEV_COPY_BATCH 16 -struct unmap_notify { - int flags; - /* Address relative to the start of the grant_map */ - int addr; - int event; +struct gntdev_copy_batch { + struct gnttab_copy ops[GNTDEV_COPY_BATCH]; + struct page *pages[GNTDEV_COPY_BATCH]; + s16 __user *status[GNTDEV_COPY_BATCH]; + unsigned int nr_ops; + unsigned int nr_pages; + bool writeable; + struct gntdev_copy_batch *next; }; -struct grant_map { - struct list_head next; - struct vm_area_struct *vma; - int index; - int count; - int flags; - atomic_t users; - struct unmap_notify notify; - struct ioctl_gntdev_grant_ref *grants; - struct gnttab_map_grant_ref *map_ops; - struct gnttab_unmap_grant_ref *unmap_ops; - struct gnttab_map_grant_ref *kmap_ops; - struct page **pages; -}; +static unsigned int limit = 64*1024; +module_param(limit, uint, 0644); +MODULE_PARM_DESC(limit, + "Maximum number of grants that may be mapped by one mapping request"); + +static void unmap_grant_pages(struct gntdev_grant_map *map, + int offset, int pages); -static int unmap_grant_pages(struct grant_map *map, int offset, int pages); +static struct miscdevice gntdev_miscdev; /* ------------------------------------------------------------------ */ +bool gntdev_test_page_count(unsigned int count) +{ + return !count || count > limit; +} + static void gntdev_print_maps(struct gntdev_priv *priv, char *text, int text_index) { #ifdef DEBUG - struct grant_map *map; + struct gntdev_grant_map *map; pr_debug("%s: maps list (priv %p)\n", __func__, priv); list_for_each_entry(map, &priv->maps, next) @@ -112,54 +99,124 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #endif } -static void gntdev_free_map(struct grant_map *map) +static void gntdev_free_map(struct gntdev_grant_map *map) { if (map == NULL) return; +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + if (map->dma_vaddr) { + struct gnttab_dma_alloc_args args; + + args.dev = map->dma_dev; + args.coherent = !!(map->dma_flags & GNTDEV_DMA_FLAG_COHERENT); + args.nr_pages = map->count; + args.pages = map->pages; + args.frames = map->frames; + args.vaddr = map->dma_vaddr; + args.dev_bus_addr = map->dma_bus_addr; + + gnttab_dma_free_pages(&args); + } else +#endif if (map->pages) - free_xenballooned_pages(map->count, map->pages); - kfree(map->pages); - kfree(map->grants); - kfree(map->map_ops); - kfree(map->unmap_ops); - kfree(map->kmap_ops); + gnttab_free_pages(map->count, map->pages); + +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + kvfree(map->frames); +#endif + kvfree(map->pages); + kvfree(map->grants); + kvfree(map->map_ops); + kvfree(map->unmap_ops); + kvfree(map->kmap_ops); + kvfree(map->kunmap_ops); + kvfree(map->being_removed); kfree(map); } -static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, + int dma_flags) { - struct grant_map *add; + struct gntdev_grant_map *add; int i; - add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + add = kzalloc(sizeof(*add), GFP_KERNEL); if (NULL == add) return NULL; - add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL); - add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); - add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); - add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); - add->pages = 
kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); + add->grants = kvmalloc_array(count, sizeof(add->grants[0]), + GFP_KERNEL); + add->map_ops = kvmalloc_array(count, sizeof(add->map_ops[0]), + GFP_KERNEL); + add->unmap_ops = kvmalloc_array(count, sizeof(add->unmap_ops[0]), + GFP_KERNEL); + add->pages = kvcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); + add->being_removed = + kvcalloc(count, sizeof(add->being_removed[0]), GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || - NULL == add->kmap_ops || - NULL == add->pages) + NULL == add->pages || + NULL == add->being_removed) goto err; + if (xen_pv_domain()) { + add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]), + GFP_KERNEL); + add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]), + GFP_KERNEL); + if (NULL == add->kmap_ops || NULL == add->kunmap_ops) + goto err; + } + +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + add->dma_flags = dma_flags; + + /* + * Check if this mapping is requested to be backed + * by a DMA buffer. + */ + if (dma_flags & (GNTDEV_DMA_FLAG_WC | GNTDEV_DMA_FLAG_COHERENT)) { + struct gnttab_dma_alloc_args args; + + add->frames = kvcalloc(count, sizeof(add->frames[0]), + GFP_KERNEL); + if (!add->frames) + goto err; + + /* Remember the device, so we can free DMA memory. */ + add->dma_dev = priv->dma_dev; + + args.dev = priv->dma_dev; + args.coherent = !!(dma_flags & GNTDEV_DMA_FLAG_COHERENT); + args.nr_pages = count; + args.pages = add->pages; + args.frames = add->frames; - if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) + if (gnttab_dma_alloc_pages(&args)) + goto err; + + add->dma_vaddr = args.vaddr; + add->dma_bus_addr = args.dev_bus_addr; + } else +#endif + if (gnttab_alloc_pages(count, add->pages)) goto err; for (i = 0; i < count; i++) { - add->map_ops[i].handle = -1; - add->unmap_ops[i].handle = -1; - add->kmap_ops[i].handle = -1; + add->grants[i].domid = DOMID_INVALID; + add->grants[i].ref = INVALID_GRANT_REF; + add->map_ops[i].handle = INVALID_GRANT_HANDLE; + add->unmap_ops[i].handle = INVALID_GRANT_HANDLE; + if (xen_pv_domain()) { + add->kmap_ops[i].handle = INVALID_GRANT_HANDLE; + add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE; + } } add->index = 0; add->count = count; - atomic_set(&add->users, 1); + refcount_set(&add->users, 1); return add; @@ -168,9 +225,9 @@ err: return NULL; } -static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add) { - struct grant_map *map; + struct gntdev_grant_map *map; list_for_each_entry(map, &priv->maps, next) { if (add->index + add->count < map->index) { @@ -185,10 +242,10 @@ done: gntdev_print_maps(priv, "[new]", add->index); } -static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, - int index, int count) +static struct gntdev_grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) { - struct grant_map *map; + struct gntdev_grant_map *map; list_for_each_entry(map, &priv->maps, next) { if (map->index != index) @@ -200,60 +257,84 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) +void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map) { if (!map) return; - if (!atomic_dec_and_test(&map->users)) + if (!refcount_dec_and_test(&map->users)) return; - atomic_sub(map->count, &pages_mapped); + if (map->pages && !xen_pv_domain()) { + /* + * 
Increment the reference count. This ensures that the + * subsequent call to unmap_grant_pages() will not wind up + * re-entering itself. It *can* wind up calling + * gntdev_put_map() recursively, but such calls will be with a + * reference count greater than 1, so they will return before + * this code is reached. The recursion depth is thus limited to + * 1. Do NOT use refcount_inc() here, as it will detect that + * the reference count is zero and WARN(). + */ + refcount_set(&map->users, 1); + + /* + * Unmap the grants. This may or may not be asynchronous, so it + * is possible that the reference count is 1 on return, but it + * could also be greater than 1. + */ + unmap_grant_pages(map, 0, map->count); + + /* Check if the memory now needs to be freed */ + if (!refcount_dec_and_test(&map->users)) + return; + + /* + * All pages have been returned to the hypervisor, so free the + * map. + */ + } + + if (xen_pv_domain() && map->notifier_init) + mmu_interval_notifier_remove(&map->notifier); if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(map->notify.event); evtchn_put(map->notify.event); } - - if (populate_freeable_maps && priv) { - spin_lock(&priv->lock); - list_del(&map->next); - spin_unlock(&priv->lock); - } - - if (map->pages && !use_ptemod) - unmap_grant_pages(map, 0, map->count); gntdev_free_map(map); } /* ------------------------------------------------------------------ */ -static int find_grant_ptes(pte_t *pte, pgtable_t token, - unsigned long addr, void *data) +static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) { - struct grant_map *map = data; - unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; - int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte; + struct gntdev_grant_map *map = data; + unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT; + int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte | + (1 << _GNTMAP_guest_avail0); u64 pte_maddr; BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; + /* Note: this will perform a pte_mkspecial() through the hypercall. */ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, - -1 /* handle */); + INVALID_GRANT_HANDLE); return 0; } -static int map_grant_pages(struct grant_map *map) +int gntdev_map_grant_pages(struct gntdev_grant_map *map) { + size_t alloced = 0; int i, err = 0; - if (!use_ptemod) { + if (!xen_pv_domain()) { /* Note: it could already be mapped */ - if (map->map_ops[0].handle != -1) + if (map->map_ops[0].handle != INVALID_GRANT_HANDLE) return 0; for (i = 0; i < map->count; i++) { unsigned long addr = (unsigned long) @@ -262,7 +343,7 @@ static int map_grant_pages(struct grant_map *map) map->grants[i].ref, map->grants[i].domid); gnttab_set_unmap_op(&map->unmap_ops[i], addr, - map->flags, -1 /* handle */); + map->flags, INVALID_GRANT_HANDLE); } } else { /* @@ -270,226 +351,234 @@ static int map_grant_pages(struct grant_map *map) * to the kernel linear addresses of the struct pages. * These ptes are completely different from the user ptes dealt * with find_grant_ptes. + * Note that GNTMAP_device_map isn't needed here: The + * dev_bus_addr output field gets consumed only from ->map_ops, + * and by not requesting it when mapping we also avoid needing + * to mirror dev_bus_addr into ->unmap_ops (and holding an extra + * reference to the page in the hypervisor). 
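+	 *
+	 * Purely as an informative example: a mapping requested with
+	 * GNTMAP_host_map | GNTMAP_device_map | GNTMAP_readonly ends up
+	 * using GNTMAP_host_map | GNTMAP_readonly for these kernel-side ops.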
*/ + unsigned int flags = (map->flags & ~GNTMAP_device_map) | + GNTMAP_host_map; + for (i = 0; i < map->count; i++) { - unsigned level; unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); - pte_t *ptep; - u64 pte_maddr = 0; BUG_ON(PageHighMem(map->pages[i])); - ptep = lookup_address(address, &level); - pte_maddr = arbitrary_virt_to_machine(ptep).maddr; - gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, - map->flags | - GNTMAP_host_map | - GNTMAP_contains_pte, + gnttab_set_map_op(&map->kmap_ops[i], address, flags, map->grants[i].ref, map->grants[i].domid); + gnttab_set_unmap_op(&map->kunmap_ops[i], address, + flags, INVALID_GRANT_HANDLE); } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, - map->pages, map->count); - if (err) - return err; + err = gnttab_map_refs(map->map_ops, map->kmap_ops, map->pages, + map->count); for (i = 0; i < map->count; i++) { - if (map->map_ops[i].status) - err = -EINVAL; - else { - BUG_ON(map->map_ops[i].handle == -1); + if (map->map_ops[i].status == GNTST_okay) { map->unmap_ops[i].handle = map->map_ops[i].handle; - pr_debug("map handle=%d\n", map->map_ops[i].handle); + alloced++; + } else if (!err) + err = -EINVAL; + + if (map->flags & GNTMAP_device_map) + map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr; + + if (xen_pv_domain()) { + if (map->kmap_ops[i].status == GNTST_okay) { + alloced++; + map->kunmap_ops[i].handle = map->kmap_ops[i].handle; + } else if (!err) + err = -EINVAL; } } + atomic_add(alloced, &map->live_grants); return err; } -static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) +static void __unmap_grant_pages_done(int result, + struct gntab_unmap_queue_data *data) { - int i, err = 0; + unsigned int i; + struct gntdev_grant_map *map = data->data; + unsigned int offset = data->unmap_ops - map->unmap_ops; + int successful_unmaps = 0; + int live_grants; + + for (i = 0; i < data->count; i++) { + if (map->unmap_ops[offset + i].status == GNTST_okay && + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) + successful_unmaps++; + + WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay && + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); + pr_debug("unmap handle=%d st=%d\n", + map->unmap_ops[offset+i].handle, + map->unmap_ops[offset+i].status); + map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + if (xen_pv_domain()) { + if (map->kunmap_ops[offset + i].status == GNTST_okay && + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE) + successful_unmaps++; + + WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay && + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); + pr_debug("kunmap handle=%u st=%d\n", + map->kunmap_ops[offset+i].handle, + map->kunmap_ops[offset+i].status); + map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; + } + } + + /* + * Decrease the live-grant counter. This must happen after the loop to + * prevent premature reuse of the grants by gnttab_mmap(). 
+ */ + live_grants = atomic_sub_return(successful_unmaps, &map->live_grants); + if (WARN_ON(live_grants < 0)) + pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n", + __func__, live_grants, successful_unmaps); + /* Release reference taken by __unmap_grant_pages */ + gntdev_put_map(NULL, map); +} + +static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset, + int pages) +{ if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); + if (pgno >= offset && pgno < offset + pages) { /* No need for kmap, pages are in lowmem */ uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); + tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } - err = gnttab_unmap_refs(map->unmap_ops + offset, - use_ptemod ? map->kmap_ops + offset : NULL, map->pages + offset, - pages); - if (err) - return err; + map->unmap_data.unmap_ops = map->unmap_ops + offset; + map->unmap_data.kunmap_ops = xen_pv_domain() ? map->kunmap_ops + offset : NULL; + map->unmap_data.pages = map->pages + offset; + map->unmap_data.count = pages; + map->unmap_data.done = __unmap_grant_pages_done; + map->unmap_data.data = map; + refcount_inc(&map->users); /* to keep map alive during async call below */ - for (i = 0; i < pages; i++) { - if (map->unmap_ops[offset+i].status) - err = -EINVAL; - pr_debug("unmap handle=%d st=%d\n", - map->unmap_ops[offset+i].handle, - map->unmap_ops[offset+i].status); - map->unmap_ops[offset+i].handle = -1; - } - return err; + gnttab_unmap_refs_async(&map->unmap_data); } -static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +static void unmap_grant_pages(struct gntdev_grant_map *map, int offset, + int pages) { - int range, err = 0; + int range; + + if (atomic_read(&map->live_grants) == 0) + return; /* Nothing to do */ pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); /* It is possible the requested range will have a "hole" where we * already unmapped some of the grants. Only unmap valid ranges. */ - while (pages && !err) { - while (pages && map->unmap_ops[offset].handle == -1) { + while (pages) { + while (pages && map->being_removed[offset]) { offset++; pages--; } range = 0; while (range < pages) { - if (map->unmap_ops[offset+range].handle == -1) { - range--; + if (map->being_removed[offset + range]) break; - } + map->being_removed[offset + range] = true; range++; } - err = __unmap_grant_pages(map, offset, range); + if (range) + __unmap_grant_pages(map, offset, range); offset += range; pages -= range; } - - return err; } /* ------------------------------------------------------------------ */ static void gntdev_vma_open(struct vm_area_struct *vma) { - struct grant_map *map = vma->vm_private_data; + struct gntdev_grant_map *map = vma->vm_private_data; pr_debug("gntdev_vma_open %p\n", vma); - atomic_inc(&map->users); + refcount_inc(&map->users); } static void gntdev_vma_close(struct vm_area_struct *vma) { - struct grant_map *map = vma->vm_private_data; + struct gntdev_grant_map *map = vma->vm_private_data; struct file *file = vma->vm_file; struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - if (use_ptemod) { - /* It is possible that an mmu notifier could be running - * concurrently, so take priv->lock to ensure that the vma won't - * vanishing during the unmap_grant_pages call, since we will - * spin here until that completes. 
Such a concurrent call will - * not do any unmapping, since that has been done prior to - * closing the vma, but it may still iterate the unmap_ops list. - */ - spin_lock(&priv->lock); - map->vma = NULL; - spin_unlock(&priv->lock); - } + vma->vm_private_data = NULL; gntdev_put_map(priv, map); } -static struct vm_operations_struct gntdev_vmops = { +static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct gntdev_grant_map *map = vma->vm_private_data; + + return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT]; +} + +static const struct vm_operations_struct gntdev_vmops = { .open = gntdev_vma_open, .close = gntdev_vma_close, + .find_normal_page = gntdev_vma_find_normal_page, }; /* ------------------------------------------------------------------ */ -static void unmap_if_in_range(struct grant_map *map, - unsigned long start, unsigned long end) +static bool gntdev_invalidate(struct mmu_interval_notifier *mn, + const struct mmu_notifier_range *range, + unsigned long cur_seq) { + struct gntdev_grant_map *map = + container_of(mn, struct gntdev_grant_map, notifier); unsigned long mstart, mend; - int err; + unsigned long map_start, map_end; - if (!map->vma) - return; - if (map->vma->vm_start >= end) - return; - if (map->vma->vm_end <= start) - return; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); -} + if (!mmu_notifier_range_blockable(range)) + return false; -static void mn_invl_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); - struct grant_map *map; + map_start = map->pages_vm_start; + map_end = map->pages_vm_start + (map->count << PAGE_SHIFT); - spin_lock(&priv->lock); - list_for_each_entry(map, &priv->maps, next) { - unmap_if_in_range(map, start, end); - } - list_for_each_entry(map, &priv->freeable_maps, next) { - unmap_if_in_range(map, start, end); - } - spin_unlock(&priv->lock); -} - -static void mn_invl_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); -} + /* + * If the VMA is split or otherwise changed the notifier is not + * updated, but we don't want to process VA's outside the modified + * VMA. FIXME: It would be much more understandable to just prevent + * modifying the VMA in the first place. 
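+	 *
+	 * The early return below skips invalidations that do not overlap
+	 * this map at all; the max()/min() clamping further down then turns
+	 * the overlapping byte range into a page offset and count within the
+	 * map before the affected grants are unmapped.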
+ */ + if (map_start >= range->end || map_end <= range->start) + return true; -static void mn_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); - struct grant_map *map; - int err; + mstart = max(range->start, map_start); + mend = min(range->end, map_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, map_start, map_end, + range->start, range->end, mstart, mend); + unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); - spin_lock(&priv->lock); - list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - pr_debug("map %d+%d (%lx %lx)\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end); - err = unmap_grant_pages(map, /* offset */ 0, map->count); - WARN_ON(err); - } - list_for_each_entry(map, &priv->freeable_maps, next) { - if (!map->vma) - continue; - pr_debug("map %d+%d (%lx %lx)\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end); - err = unmap_grant_pages(map, /* offset */ 0, map->count); - WARN_ON(err); - } - spin_unlock(&priv->lock); + return true; } -static struct mmu_notifier_ops gntdev_mmu_ops = { - .release = mn_release, - .invalidate_page = mn_invl_page, - .invalidate_range_start = mn_invl_range_start, +static const struct mmu_interval_notifier_ops gntdev_mmu_ops = { + .invalidate = gntdev_invalidate, }; /* ------------------------------------------------------------------ */ @@ -497,33 +586,31 @@ static struct mmu_notifier_ops gntdev_mmu_ops = { static int gntdev_open(struct inode *inode, struct file *flip) { struct gntdev_priv *priv; - int ret = 0; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); - INIT_LIST_HEAD(&priv->freeable_maps); - spin_lock_init(&priv->lock); - - if (use_ptemod) { - priv->mm = get_task_mm(current); - if (!priv->mm) { - kfree(priv); - return -ENOMEM; - } - priv->mn.ops = &gntdev_mmu_ops; - ret = mmu_notifier_register(&priv->mn, priv->mm); - mmput(priv->mm); - } + mutex_init(&priv->lock); + + mutex_init(&priv->batch_lock); + +#ifdef CONFIG_XEN_GNTDEV_DMABUF + priv->dmabuf_priv = gntdev_dmabuf_init(flip); + if (IS_ERR(priv->dmabuf_priv)) { + int ret = PTR_ERR(priv->dmabuf_priv); - if (ret) { kfree(priv); return ret; } +#endif flip->private_data = priv; +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC + priv->dma_dev = gntdev_miscdev.this_device; + dma_coerce_mask_and_coherent(priv->dma_dev, DMA_BIT_MASK(64)); +#endif pr_debug("priv %p\n", priv); return 0; @@ -532,19 +619,32 @@ static int gntdev_open(struct inode *inode, struct file *flip) static int gntdev_release(struct inode *inode, struct file *flip) { struct gntdev_priv *priv = flip->private_data; - struct grant_map *map; + struct gntdev_grant_map *map; + struct gntdev_copy_batch *batch; pr_debug("priv %p\n", priv); + mutex_lock(&priv->lock); while (!list_empty(&priv->maps)) { - map = list_entry(priv->maps.next, struct grant_map, next); + map = list_entry(priv->maps.next, + struct gntdev_grant_map, next); list_del(&map->next); gntdev_put_map(NULL /* already removed */, map); } - WARN_ON(!list_empty(&priv->freeable_maps)); + mutex_unlock(&priv->lock); + + mutex_lock(&priv->batch_lock); + while (priv->batch) { + batch = priv->batch; + priv->batch = batch->next; + kfree(batch); + } + mutex_unlock(&priv->batch_lock); + +#ifdef CONFIG_XEN_GNTDEV_DMABUF + gntdev_dmabuf_fini(priv->dmabuf_priv); +#endif - if (use_ptemod) - mmu_notifier_unregister(&priv->mn, 
priv->mm); kfree(priv); return 0; } @@ -553,36 +653,30 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, struct ioctl_gntdev_map_grant_ref __user *u) { struct ioctl_gntdev_map_grant_ref op; - struct grant_map *map; + struct gntdev_grant_map *map; int err; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, add %d\n", priv, op.count); - if (unlikely(op.count <= 0)) + if (unlikely(gntdev_test_page_count(op.count))) return -EINVAL; err = -ENOMEM; - map = gntdev_alloc_map(priv, op.count); + map = gntdev_alloc_map(priv, op.count, 0 /* This is not a dma-buf. */); if (!map) return err; - if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { - pr_debug("can't map: over limit\n"); - gntdev_put_map(NULL, map); - return err; - } - if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { gntdev_put_map(NULL, map); return -EFAULT; } - spin_lock(&priv->lock); + mutex_lock(&priv->lock); gntdev_add_map(priv, map); op.index = map->index << PAGE_SHIFT; - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); if (copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; @@ -594,22 +688,20 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, struct ioctl_gntdev_unmap_grant_ref __user *u) { struct ioctl_gntdev_unmap_grant_ref op; - struct grant_map *map; + struct gntdev_grant_map *map; int err = -ENOENT; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); - spin_lock(&priv->lock); + mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - if (populate_freeable_maps) - list_add_tail(&map->next, &priv->freeable_maps); err = 0; } - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); if (map) gntdev_put_map(priv, map); return err; @@ -620,14 +712,14 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, { struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; - struct grant_map *map; + struct gntdev_grant_map *map; int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) goto out_unlock; @@ -641,7 +733,7 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, rv = 0; out_unlock: - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; @@ -651,10 +743,10 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) { struct ioctl_gntdev_unmap_notify op; - struct grant_map *map; + struct gntdev_grant_map *map; int rc; int out_flags; - unsigned int out_event; + evtchn_port_t out_event; if (copy_from_user(&op, u, sizeof(op))) return -EFAULT; @@ -677,7 +769,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) out_flags = op.action; out_event = op.event_channel_port; - spin_lock(&priv->lock); + mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { uint64_t begin = map->index << PAGE_SHIFT; @@ -705,7 +797,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) rc = 0; unlock_out: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); /* Drop the
reference to the event channel we did not save in the map */ if (out_flags & UNMAP_NOTIFY_SEND_EVENT) @@ -714,6 +806,213 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) return rc; } +static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, + unsigned long *gfn) +{ + unsigned long addr = (unsigned long)virt; + struct page *page; + unsigned long xen_pfn; + int ret; + + ret = pin_user_pages_fast(addr, 1, batch->writeable ? FOLL_WRITE : 0, &page); + if (ret < 0) + return ret; + + batch->pages[batch->nr_pages++] = page; + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(addr & ~PAGE_MASK); + *gfn = pfn_to_gfn(xen_pfn); + + return 0; +} + +static void gntdev_put_pages(struct gntdev_copy_batch *batch) +{ + unpin_user_pages_dirty_lock(batch->pages, batch->nr_pages, batch->writeable); + batch->nr_pages = 0; + batch->writeable = false; +} + +static int gntdev_copy(struct gntdev_copy_batch *batch) +{ + unsigned int i; + + gnttab_batch_copy(batch->ops, batch->nr_ops); + gntdev_put_pages(batch); + + /* + * For each completed op, update the status if the op failed + * and all previous ops for the segment were successful. + */ + for (i = 0; i < batch->nr_ops; i++) { + s16 status = batch->ops[i].status; + s16 old_status; + + if (status == GNTST_okay) + continue; + + if (__get_user(old_status, batch->status[i])) + return -EFAULT; + + if (old_status != GNTST_okay) + continue; + + if (__put_user(status, batch->status[i])) + return -EFAULT; + } + + batch->nr_ops = 0; + return 0; +} + +static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, + struct gntdev_grant_copy_segment *seg, + s16 __user *status) +{ + uint16_t copied = 0; + + /* + * Disallow local -> local copies since there is only space in + * batch->pages for one page per-op and this would be a very + * expensive memcpy(). + */ + if (!(seg->flags & (GNTCOPY_source_gref | GNTCOPY_dest_gref))) + return -EINVAL; + + /* Can't cross page if source/dest is a grant ref. 
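+	 * A single gnttab_copy operation can only address one granted frame,
+	 * so a foreign source or destination must fit within XEN_PAGE_SIZE
+	 * of its starting offset; local virtual addresses, by contrast, are
+	 * split into per-page operations by the loop below.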
*/ + if (seg->flags & GNTCOPY_source_gref) { + if (seg->source.foreign.offset + seg->len > XEN_PAGE_SIZE) + return -EINVAL; + } + if (seg->flags & GNTCOPY_dest_gref) { + if (seg->dest.foreign.offset + seg->len > XEN_PAGE_SIZE) + return -EINVAL; + } + + if (put_user(GNTST_okay, status)) + return -EFAULT; + + while (copied < seg->len) { + struct gnttab_copy *op; + void __user *virt; + size_t len, off; + unsigned long gfn; + int ret; + + if (batch->nr_ops >= GNTDEV_COPY_BATCH) { + ret = gntdev_copy(batch); + if (ret < 0) + return ret; + } + + len = seg->len - copied; + + op = &batch->ops[batch->nr_ops]; + op->flags = 0; + + if (seg->flags & GNTCOPY_source_gref) { + op->source.u.ref = seg->source.foreign.ref; + op->source.domid = seg->source.foreign.domid; + op->source.offset = seg->source.foreign.offset + copied; + op->flags |= GNTCOPY_source_gref; + } else { + virt = seg->source.virt + copied; + off = (unsigned long)virt & ~XEN_PAGE_MASK; + len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = false; + + ret = gntdev_get_page(batch, virt, &gfn); + if (ret < 0) + return ret; + + op->source.u.gmfn = gfn; + op->source.domid = DOMID_SELF; + op->source.offset = off; + } + + if (seg->flags & GNTCOPY_dest_gref) { + op->dest.u.ref = seg->dest.foreign.ref; + op->dest.domid = seg->dest.foreign.domid; + op->dest.offset = seg->dest.foreign.offset + copied; + op->flags |= GNTCOPY_dest_gref; + } else { + virt = seg->dest.virt + copied; + off = (unsigned long)virt & ~XEN_PAGE_MASK; + len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = true; + + ret = gntdev_get_page(batch, virt, &gfn); + if (ret < 0) + return ret; + + op->dest.u.gmfn = gfn; + op->dest.domid = DOMID_SELF; + op->dest.offset = off; + } + + op->len = len; + copied += len; + + batch->status[batch->nr_ops] = status; + batch->nr_ops++; + } + + return 0; +} + +static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u) +{ + struct ioctl_gntdev_grant_copy copy; + struct gntdev_copy_batch *batch; + unsigned int i; + int ret = 0; + + if (copy_from_user(&copy, u, sizeof(copy))) + return -EFAULT; + + mutex_lock(&priv->batch_lock); + if (!priv->batch) { + batch = kmalloc(sizeof(*batch), GFP_KERNEL); + } else { + batch = priv->batch; + priv->batch = batch->next; + } + mutex_unlock(&priv->batch_lock); + if (!batch) + return -ENOMEM; + + batch->nr_ops = 0; + batch->nr_pages = 0; + + for (i = 0; i < copy.count; i++) { + struct gntdev_grant_copy_segment seg; + + if (copy_from_user(&seg, &copy.segments[i], sizeof(seg))) { + ret = -EFAULT; + gntdev_put_pages(batch); + goto out; + } + + ret = gntdev_grant_copy_seg(batch, &seg, &copy.segments[i].status); + if (ret < 0) { + gntdev_put_pages(batch); + goto out; + } + + cond_resched(); + } + if (batch->nr_ops) + ret = gntdev_copy(batch); + + out: + mutex_lock(&priv->batch_lock); + batch->next = priv->batch; + priv->batch = batch; + mutex_unlock(&priv->batch_lock); + + return ret; +} + static long gntdev_ioctl(struct file *flip, unsigned int cmd, unsigned long arg) { @@ -733,6 +1032,23 @@ static long gntdev_ioctl(struct file *flip, case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: return gntdev_ioctl_notify(priv, ptr); + case IOCTL_GNTDEV_GRANT_COPY: + return gntdev_ioctl_grant_copy(priv, ptr); + +#ifdef CONFIG_XEN_GNTDEV_DMABUF + case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS: + return gntdev_ioctl_dmabuf_exp_from_refs(priv, ptr); + + case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED: + return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr); + + case IOCTL_GNTDEV_DMABUF_IMP_TO_REFS: + return
gntdev_ioctl_dmabuf_imp_to_refs(priv, ptr); + + case IOCTL_GNTDEV_DMABUF_IMP_RELEASE: + return gntdev_ioctl_dmabuf_imp_release(priv, ptr); +#endif + default: pr_debug("priv %p, unknown cmd %x\n", priv, cmd); return -ENOIOCTLCMD; @@ -745,41 +1061,33 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) { struct gntdev_priv *priv = flip->private_data; int index = vma->vm_pgoff; - int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - struct grant_map *map; - int i, err = -EINVAL; + int count = vma_pages(vma); + struct gntdev_grant_map *map; + int err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; pr_debug("map %d+%d at %lx (pgoff %lx)\n", - index, count, vma->vm_start, vma->vm_pgoff); + index, count, vma->vm_start, vma->vm_pgoff); - spin_lock(&priv->lock); + mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; - if (use_ptemod && map->vma) - goto unlock_out; - if (use_ptemod && priv->mm != vma->vm_mm) { - pr_warn("Huh? Other mm?\n"); + if (!atomic_add_unless(&map->in_use, 1, 1)) goto unlock_out; - } - atomic_inc(&map->users); + refcount_inc(&map->users); vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); - if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + if (xen_pv_domain()) + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; - - if (use_ptemod) - map->vma = vma; - if (map->flags) { if ((vma->vm_flags & VM_WRITE) && (map->flags & GNTMAP_readonly)) @@ -790,9 +1098,32 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->flags |= GNTMAP_readonly; } - spin_unlock(&priv->lock); + map->pages_vm_start = vma->vm_start; + + if (xen_pv_domain()) { + err = mmu_interval_notifier_insert_locked( + &map->notifier, vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, &gntdev_mmu_ops); + if (err) + goto out_unlock_put; + + map->notifier_init = true; + } + mutex_unlock(&priv->lock); + + if (xen_pv_domain()) { + /* + * gntdev takes the address of the PTE in find_grant_ptes() and + * passes it to the hypervisor in gntdev_map_grant_pages(). The + * purpose of the notifier is to prevent the hypervisor pointer + * to the PTE from going stale. + * + * Since this vma's mappings can't be touched without the + * mmap_lock, and we are holding it now, there is no need for + * the notifier_range locking pattern. 
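+	 *
+	 * For PV mappings, gntdev_invalidate() is what eventually unmaps the
+	 * grants again once this range is invalidated, for example at
+	 * munmap() or process exit.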
+ */ + mmu_interval_read_begin(&map->notifier); - if (use_ptemod) { err = apply_to_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, find_grant_ptes, map); @@ -802,30 +1133,27 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) } } - err = map_grant_pages(map); + err = gntdev_map_grant_pages(map); if (err) goto out_put_map; - if (!use_ptemod) { - for (i = 0; i < count; i++) { - err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, - map->pages[i]); - if (err) - goto out_put_map; - } + if (!xen_pv_domain()) { + err = vm_map_pages_zero(vma, map->pages, map->count); + if (err) + goto out_put_map; } return 0; unlock_out: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); return err; out_unlock_put: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) - map->vma = NULL; + if (xen_pv_domain()) + unmap_grant_pages(map, 0, map->count); gntdev_put_map(priv, map); return err; } @@ -853,8 +1181,6 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); - err = misc_register(&gntdev_miscdev); if (err != 0) { pr_err("Could not register gntdev device\n"); diff --git a/drivers/xen/grant-dma-iommu.c b/drivers/xen/grant-dma-iommu.c new file mode 100644 index 000000000000..0965e2dd4edf --- /dev/null +++ b/drivers/xen/grant-dma-iommu.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stub IOMMU driver which does nothing. + * The main purpose of it being present is to reuse generic IOMMU device tree + * bindings by Xen grant DMA-mapping layer. + * + * Copyright (C) 2022 EPAM Systems Inc. + */ + +#include <linux/iommu.h> +#include <linux/of.h> +#include <linux/platform_device.h> + +struct grant_dma_iommu_device { + struct device *dev; + struct iommu_device iommu; +}; + +static struct iommu_device *grant_dma_iommu_probe_device(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + +/* Nothing is really needed here except a dummy probe_device callback */ +static const struct iommu_ops grant_dma_iommu_ops = { + .probe_device = grant_dma_iommu_probe_device, +}; + +static const struct of_device_id grant_dma_iommu_of_match[] = { + { .compatible = "xen,grant-dma" }, + { }, +}; + +static int grant_dma_iommu_probe(struct platform_device *pdev) +{ + struct grant_dma_iommu_device *mmu; + int ret; + + mmu = devm_kzalloc(&pdev->dev, sizeof(*mmu), GFP_KERNEL); + if (!mmu) + return -ENOMEM; + + mmu->dev = &pdev->dev; + + ret = iommu_device_register(&mmu->iommu, &grant_dma_iommu_ops, &pdev->dev); + if (ret) + return ret; + + platform_set_drvdata(pdev, mmu); + + return 0; +} + +static void grant_dma_iommu_remove(struct platform_device *pdev) +{ + struct grant_dma_iommu_device *mmu = platform_get_drvdata(pdev); + + platform_set_drvdata(pdev, NULL); + iommu_device_unregister(&mmu->iommu); +} + +static struct platform_driver grant_dma_iommu_driver = { + .driver = { + .name = "grant-dma-iommu", + .of_match_table = grant_dma_iommu_of_match, + }, + .probe = grant_dma_iommu_probe, + .remove = grant_dma_iommu_remove, +}; + +static int __init grant_dma_iommu_init(void) +{ + struct device_node *iommu_np; + + iommu_np = of_find_matching_node(NULL, grant_dma_iommu_of_match); + if (!iommu_np) + return 0; + + of_node_put(iommu_np); + + return platform_driver_register(&grant_dma_iommu_driver); +} +subsys_initcall(grant_dma_iommu_init); diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c new file mode 100644 index 000000000000..14077d23f2a1 --- /dev/null +++ 
b/drivers/xen/grant-dma-ops.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Xen grant DMA-mapping layer - contains special DMA-mapping routines + * for providing grant references as DMA addresses to be used by frontends + * (e.g. virtio) in Xen guests + * + * Copyright (c) 2021, Juergen Gross <jgross@suse.com> + */ + +#include <linux/module.h> +#include <linux/dma-map-ops.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/xarray.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/grant_table.h> + +struct xen_grant_dma_data { + /* The ID of backend domain */ + domid_t backend_domid; + /* Is device behaving sane? */ + bool broken; +}; + +static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ); + +#define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63) + +static inline dma_addr_t grant_to_dma(grant_ref_t grant) +{ + return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant << XEN_PAGE_SHIFT); +} + +static inline grant_ref_t dma_to_grant(dma_addr_t dma) +{ + return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >> XEN_PAGE_SHIFT); +} + +static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev) +{ + struct xen_grant_dma_data *data; + unsigned long flags; + + xa_lock_irqsave(&xen_grant_dma_devices, flags); + data = xa_load(&xen_grant_dma_devices, (unsigned long)dev); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); + + return data; +} + +static int store_xen_grant_dma_data(struct device *dev, + struct xen_grant_dma_data *data) +{ + unsigned long flags; + int ret; + + xa_lock_irqsave(&xen_grant_dma_devices, flags); + ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, + GFP_ATOMIC)); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); + + return ret; +} + +/* + * DMA ops for Xen frontends (e.g. virtio). + * + * Used to act as a kind of software IOMMU for Xen guests by using grants as + * DMA addresses. + * Such a DMA address is formed by using the grant reference as a frame + * number and setting the highest address bit (this bit is for the backend + * to be able to distinguish it from e.g. a mmio address). 
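+ *
+ * Informative example only (assuming 4 KiB Xen pages): grant reference
+ * 0x1234 is handed to the device as DMA address 0x8000000001234000;
+ * dma_to_grant() masks off the top bit and shifts back down to recover
+ * the reference.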
+ */ +static void *xen_grant_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned int i, n_pages = XEN_PFN_UP(size); + unsigned long pfn; + grant_ref_t grant; + void *ret; + + data = find_xen_grant_dma_data(dev); + if (!data) + return NULL; + + if (unlikely(data->broken)) + return NULL; + + ret = alloc_pages_exact(n_pages * XEN_PAGE_SIZE, gfp); + if (!ret) + return NULL; + + pfn = virt_to_pfn(ret); + + if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) { + free_pages_exact(ret, n_pages * XEN_PAGE_SIZE); + return NULL; + } + + for (i = 0; i < n_pages; i++) { + gnttab_grant_foreign_access_ref(grant + i, data->backend_domid, + pfn_to_gfn(pfn + i), 0); + } + + *dma_handle = grant_to_dma(grant); + + return ret; +} + +static void xen_grant_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned int i, n_pages = XEN_PFN_UP(size); + grant_ref_t grant; + + data = find_xen_grant_dma_data(dev); + if (!data) + return; + + if (unlikely(data->broken)) + return; + + grant = dma_to_grant(dma_handle); + + for (i = 0; i < n_pages; i++) { + if (unlikely(!gnttab_end_foreign_access_ref(grant + i))) { + dev_alert(dev, "Grant still in use by backend domain, disabled for further use\n"); + data->broken = true; + return; + } + } + + gnttab_free_grant_reference_seq(grant, n_pages); + + free_pages_exact(vaddr, n_pages * XEN_PAGE_SIZE); +} + +static struct page *xen_grant_dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, + enum dma_data_direction dir, + gfp_t gfp) +{ + void *vaddr; + + vaddr = xen_grant_dma_alloc(dev, size, dma_handle, gfp, 0); + if (!vaddr) + return NULL; + + return virt_to_page(vaddr); +} + +static void xen_grant_dma_free_pages(struct device *dev, size_t size, + struct page *vaddr, dma_addr_t dma_handle, + enum dma_data_direction dir) +{ + xen_grant_dma_free(dev, size, page_to_virt(vaddr), dma_handle, 0); +} + +static dma_addr_t xen_grant_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned long offset = offset_in_page(phys); + unsigned long dma_offset = xen_offset_in_page(offset), + pfn_offset = XEN_PFN_DOWN(offset); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); + grant_ref_t grant; + dma_addr_t dma_handle; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (WARN_ON(dir == DMA_NONE)) + return DMA_MAPPING_ERROR; + + data = find_xen_grant_dma_data(dev); + if (!data) + return DMA_MAPPING_ERROR; + + if (unlikely(data->broken)) + return DMA_MAPPING_ERROR; + + if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) + return DMA_MAPPING_ERROR; + + for (i = 0; i < n_pages; i++) { + gnttab_grant_foreign_access_ref(grant + i, data->backend_domid, + pfn_to_gfn(page_to_xen_pfn(phys_to_page(phys)) + i + pfn_offset), + dir == DMA_TO_DEVICE); + } + + dma_handle = grant_to_dma(grant) + dma_offset; + + return dma_handle; +} + +static void xen_grant_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned long dma_offset = xen_offset_in_page(dma_handle); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); + grant_ref_t grant; + + if (WARN_ON(dir == DMA_NONE)) + return; + + data = find_xen_grant_dma_data(dev); + if (!data) + 
return; + + if (unlikely(data->broken)) + return; + + grant = dma_to_grant(dma_handle); + + for (i = 0; i < n_pages; i++) { + if (unlikely(!gnttab_end_foreign_access_ref(grant + i))) { + dev_alert(dev, "Grant still in use by backend domain, disabled for further use\n"); + data->broken = true; + return; + } + } + + gnttab_free_grant_reference_seq(grant, n_pages); +} + +static void xen_grant_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + unsigned int i; + + if (WARN_ON(dir == DMA_NONE)) + return; + + for_each_sg(sg, s, nents, i) + xen_grant_dma_unmap_phys(dev, s->dma_address, sg_dma_len(s), dir, + attrs); +} + +static int xen_grant_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + unsigned int i; + + if (WARN_ON(dir == DMA_NONE)) + return -EINVAL; + + for_each_sg(sg, s, nents, i) { + s->dma_address = xen_grant_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); + if (s->dma_address == DMA_MAPPING_ERROR) + goto out; + + sg_dma_len(s) = s->length; + } + + return nents; + +out: + xen_grant_dma_unmap_sg(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sg) = 0; + + return -EIO; +} + +static int xen_grant_dma_supported(struct device *dev, u64 mask) +{ + return mask == DMA_BIT_MASK(64); +} + +static const struct dma_map_ops xen_grant_dma_ops = { + .alloc = xen_grant_dma_alloc, + .free = xen_grant_dma_free, + .alloc_pages_op = xen_grant_dma_alloc_pages, + .free_pages = xen_grant_dma_free_pages, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .map_phys = xen_grant_dma_map_phys, + .unmap_phys = xen_grant_dma_unmap_phys, + .map_sg = xen_grant_dma_map_sg, + .unmap_sg = xen_grant_dma_unmap_sg, + .dma_supported = xen_grant_dma_supported, +}; + +static struct device_node *xen_dt_get_node(struct device *dev) +{ + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_bus *bus = pdev->bus; + + /* Walk up to the root bus to look for PCI Host controller */ + while (!pci_is_root_bus(bus)) + bus = bus->parent; + + if (!bus->bridge->parent) + return NULL; + return of_node_get(bus->bridge->parent->of_node); + } + + return of_node_get(dev->of_node); +} + +static int xen_dt_grant_init_backend_domid(struct device *dev, + struct device_node *np, + domid_t *backend_domid) +{ + struct of_phandle_args iommu_spec = { .args_count = 1 }; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + + if (of_map_id(np, rid, "iommu-map", "iommu-map-mask", &iommu_spec.np, + iommu_spec.args)) { + dev_dbg(dev, "Cannot translate ID\n"); + return -ESRCH; + } + } else { + if (of_parse_phandle_with_args(np, "iommus", "#iommu-cells", + 0, &iommu_spec)) { + dev_dbg(dev, "Cannot parse iommus property\n"); + return -ESRCH; + } + } + + if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") || + iommu_spec.args_count != 1) { + dev_dbg(dev, "Incompatible IOMMU node\n"); + of_node_put(iommu_spec.np); + return -ESRCH; + } + + of_node_put(iommu_spec.np); + + /* + * The endpoint ID here means the ID of the domain where the + * corresponding backend is running + */ + *backend_domid = iommu_spec.args[0]; + + return 0; +} + +static int xen_grant_init_backend_domid(struct device *dev, + domid_t *backend_domid) +{ + struct device_node *np; + int ret = -ENODEV; + + np = xen_dt_get_node(dev); + if (np) { + ret = 
xen_dt_grant_init_backend_domid(dev, np, backend_domid); + of_node_put(np); + } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain()) { + dev_info(dev, "Using dom0 as backend\n"); + *backend_domid = 0; + ret = 0; + } + + return ret; +} + +static void xen_grant_setup_dma_ops(struct device *dev, domid_t backend_domid) +{ + struct xen_grant_dma_data *data; + + data = find_xen_grant_dma_data(dev); + if (data) { + dev_err(dev, "Xen grant DMA data is already created\n"); + return; + } + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + goto err; + + data->backend_domid = backend_domid; + + if (store_xen_grant_dma_data(dev, data)) { + dev_err(dev, "Cannot store Xen grant DMA data\n"); + goto err; + } + + dev->dma_ops = &xen_grant_dma_ops; + + return; + +err: + devm_kfree(dev, data); + dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n"); +} + +bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) +{ + domid_t backend_domid; + + if (!xen_grant_init_backend_domid(dev->dev.parent, &backend_domid)) { + xen_grant_setup_dma_ops(dev->dev.parent, backend_domid); + return true; + } + + return false; +} + +MODULE_DESCRIPTION("Xen grant DMA-mapping layer"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 04cdeb8e3719..3e76e33f6e08 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,7 +33,8 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt -#include <linux/module.h> +#include <linux/bitmap.h> +#include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/slab.h> @@ -42,6 +43,12 @@ #include <linux/io.h> #include <linux/delay.h> #include <linux/hardirq.h> +#include <linux/workqueue.h> +#include <linux/ratelimit.h> +#include <linux/moduleparam.h> +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC +#include <linux/dma-mapping.h> +#endif #include <xen/xen.h> #include <xen/interface/xen.h> @@ -49,24 +56,50 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> +#include <xen/balloon.h> +#ifdef CONFIG_X86 +#include <asm/xen/cpuid.h> +#endif +#include <xen/mem-reservation.h> #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> -#include <asm/pgtable.h> #include <asm/sync_bitops.h> -/* External tools reserve first few grant table entries. */ -#define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; + +/* + * Handling of free grants: + * + * Free grants are in a simple list anchored in gnttab_free_head. They are + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number + * of free entries is stored in gnttab_free_count. + * Additionally there is a bitmap of free entries anchored in + * gnttab_free_bitmap. This is being used for simplifying allocation of + * multiple consecutive grants, which is needed e.g. for support of virtio. + * gnttab_last_free is used to add free entries of new frames at the end of + * the free list. + * gnttab_free_tail_ptr specifies the variable which references the start + * of consecutive free grants ending with gnttab_last_free. This pointer is + * updated in a rather defensive way, in order to avoid performance hits in + * hot paths. + * All those variables are protected by gnttab_list_lock. 
+ */ static int gnttab_free_count; -static grant_ref_t gnttab_free_head; +static unsigned int gnttab_size; +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; +static grant_ref_t *gnttab_free_tail_ptr; +static unsigned long *gnttab_free_bitmap; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); + +struct grant_frames xen_auto_xlat_grant_frames; +static unsigned int xen_gnttab_version; +module_param_named(version, xen_gnttab_version, uint, 0); static union { struct grant_entry_v1 *v1; @@ -77,6 +110,14 @@ static union { /*This is a structure of function pointers for grant table*/ struct gnttab_ops { /* + * Version of the grant interface. + */ + unsigned int version; + /* + * Grant refs per grant frame. + */ + unsigned int grefs_per_grant_frame; + /* * Mapping a list of frames for storing grant entries. Frames parameter * is used to store grant table address when grant table being setup, * nr_gframes is the number of frames to map grant table. Returning @@ -90,7 +131,7 @@ struct gnttab_ops { void (*unmap_frames)(void); /* * Introducing a valid entry into the grant table, granting the frame of - * this grant entry to domain for accessing or transfering. Ref + * this grant entry to domain for accessing. Ref * parameter is reference of this introduced grant entry, domid is id of * granted domain, frame is the page frame to be granted, and flags is * status of the grant entry to be updated. @@ -99,60 +140,27 @@ struct gnttab_ops { unsigned long frame, unsigned flags); /* * Stop granting a grant entry to domain for accessing. Ref parameter is - * reference of a grant entry whose grant access will be stopped, - * readonly is not in use in this function. If the grant entry is - * currently mapped for reading or writing, just return failure(==0) - * directly and don't tear down the grant access. Otherwise, stop grant - * access for this entry and return success(==1). - */ - int (*end_foreign_access_ref)(grant_ref_t ref, int readonly); - /* - * Stop granting a grant entry to domain for transfer. Ref parameter is - * reference of a grant entry whose grant transfer will be stopped. If - * tranfer has not started, just reclaim the grant entry and return - * failure(==0). Otherwise, wait for the transfer to complete and then - * return the frame. - */ - unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); - /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. + * reference of a grant entry whose grant access will be stopped. + * If the grant entry is currently mapped for reading or writing, just + * return failure(==0) directly and don't tear down the grant access. + * Otherwise, stop grant access for this entry and return success(==1). */ - int (*query_foreign_access)(grant_ref_t ref); + int (*end_foreign_access_ref)(grant_ref_t ref); /* - * Grant a domain to access a range of bytes within the page referred by - * an available grant entry. Ref parameter is reference of a grant entry - * which will be sub-page accessed, domid is id of grantee domain, frame - * is frame address of subpage grant, flags is grant type and flag - * information, page_off is offset of the range of bytes, and length is - * length of bytes to be accessed. 
+ * Read the frame number related to a given grant reference. */ - void (*update_subpage_entry)(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, unsigned length); - /* - * Redirect an available grant entry on domain A to another grant - * reference of domain B, then allow domain C to use grant reference - * of domain B transitively. Ref parameter is an available grant entry - * reference on domain A, domid is id of domain C which accesses grant - * entry transitively, flags is grant type and flag information, - * trans_domid is id of domain B whose grant entry is finally accessed - * transitively, trans_gref is grant entry transitive reference of - * domain B. - */ - void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags, - domid_t trans_domid, grant_ref_t trans_gref); + unsigned long (*read_frame)(grant_ref_t ref); }; -static struct gnttab_ops *gnttab_interface; +struct unmap_refs_callback_data { + struct completion completion; + int result; +}; -/*This reflects status of grant entries, so act as a global value*/ -static grant_status_t *grstatus; +static const struct gnttab_ops *gnttab_interface; -static int grant_table_version; -static int grefs_per_grant_frame; +/* This reflects status of grant entries, so act as a global value. */ +static grant_status_t *grstatus; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -184,16 +192,116 @@ static int get_free_entries(unsigned count) ref = head = gnttab_free_head; gnttab_free_count -= count; - while (count-- > 1) - head = gnttab_entry(head); + while (count--) { + bitmap_clear(gnttab_free_bitmap, head, 1); + if (gnttab_free_tail_ptr == __gnttab_entry(head)) + gnttab_free_tail_ptr = &gnttab_free_head; + if (count) + head = gnttab_entry(head); + } gnttab_free_head = gnttab_entry(head); gnttab_entry(head) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_last_free = GNTTAB_LIST_END; + gnttab_free_tail_ptr = NULL; + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); return ref; } +static int get_seq_entry_count(void) +{ + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || + *gnttab_free_tail_ptr == GNTTAB_LIST_END) + return 0; + + return gnttab_last_free - *gnttab_free_tail_ptr + 1; +} + +/* Rebuilds the free grant list and tries to find count consecutive entries. */ +static int get_free_seq(unsigned int count) +{ + int ret = -ENOSPC; + unsigned int from, to; + grant_ref_t *last; + + gnttab_free_tail_ptr = &gnttab_free_head; + last = &gnttab_free_head; + + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); + from < gnttab_size; + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, + from + 1); + if (ret < 0 && to - from >= count) { + ret = from; + bitmap_clear(gnttab_free_bitmap, ret, count); + from += count; + gnttab_free_count -= count; + if (from == to) + continue; + } + + /* + * Recreate the free list in order to have it properly sorted. + * This is needed to make sure that the free tail has the maximum + * possible size. 
+ */ + while (from < to) { + *last = from; + last = __gnttab_entry(from); + gnttab_last_free = from; + from++; + } + if (to < gnttab_size) + gnttab_free_tail_ptr = __gnttab_entry(to - 1); + } + + *last = GNTTAB_LIST_END; + if (gnttab_last_free != gnttab_size - 1) + gnttab_free_tail_ptr = NULL; + + return ret; +} + +static int get_free_entries_seq(unsigned int count) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + if (gnttab_free_count < count) { + ret = gnttab_expand(count - gnttab_free_count); + if (ret < 0) + goto out; + } + + if (get_seq_entry_count() < count) { + ret = get_free_seq(count); + if (ret >= 0) + goto out; + ret = gnttab_expand(count - get_seq_entry_count()); + if (ret < 0) + goto out; + } + + ret = *gnttab_free_tail_ptr; + *gnttab_free_tail_ptr = gnttab_entry(ret + count - 1); + gnttab_free_count -= count; + if (!gnttab_free_count) + gnttab_free_tail_ptr = NULL; + bitmap_clear(gnttab_free_bitmap, ret, count); + + out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); + + return ret; +} + static void do_free_callbacks(void) { struct gnttab_free_callback *callback, *next; @@ -220,25 +328,56 @@ static inline void check_free_callbacks(void) do_free_callbacks(); } -static void put_free_entry(grant_ref_t ref) +static void put_free_entry_locked(grant_ref_t ref) { - unsigned long flags; - spin_lock_irqsave(&gnttab_list_lock, flags); + if (unlikely(ref < GNTTAB_NR_RESERVED_ENTRIES)) + return; + gnttab_entry(ref) = gnttab_free_head; gnttab_free_head = ref; + if (!gnttab_free_count) + gnttab_last_free = ref; + if (gnttab_free_tail_ptr == &gnttab_free_head) + gnttab_free_tail_ptr = __gnttab_entry(ref); gnttab_free_count++; + bitmap_set(gnttab_free_bitmap, ref, 1); +} + +static void put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + put_free_entry_locked(ref); check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } +static void gnttab_set_free(unsigned int start, unsigned int n) +{ + unsigned int i; + + for (i = start; i < start + n - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(i) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_free_head = start; + gnttab_free_tail_ptr = &gnttab_free_head; + } else { + gnttab_entry(gnttab_last_free) = start; + } + gnttab_free_count += n; + gnttab_last_free = i; + + bitmap_set(gnttab_free_bitmap, start, n); +} + /* * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. * Introducing a valid entry into the grant table: * 1. Write ent->domid. - * 2. Write ent->frame: - * GTF_permit_access: Frame to which access is permitted. - * GTF_accept_transfer: Pseudo-phys frame slot being filled by new - * frame, or zero if none. + * 2. Write ent->frame: Frame to which access is permitted. * 3. Write memory barrier (WMB). * 4. Write ent->flags, inc. valid type. */ @@ -252,11 +391,11 @@ static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, } static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid, - unsigned long frame, unsigned flags) + unsigned long frame, unsigned int flags) { gnttab_shared.v2[ref].hdr.domid = domid; gnttab_shared.v2[ref].full_page.frame = frame; - wmb(); + wmb(); /* Hypervisor concurrent accesses. 
*/ gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags; } @@ -286,167 +425,33 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, unsigned length) -{ - gnttab_shared.v2[ref].sub_page.frame = frame; - gnttab_shared.v2[ref].sub_page.page_off = page_off; - gnttab_shared.v2[ref].sub_page.length = length; - gnttab_shared.v2[ref].hdr.domid = domid; - wmb(); - gnttab_shared.v2[ref].hdr.flags = - GTF_permit_access | GTF_sub_page | flags; -} - -int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length) -{ - if (flags & (GTF_accept_transfer | GTF_reading | - GTF_writing | GTF_transitive)) - return -EPERM; - - if (gnttab_interface->update_subpage_entry == NULL) - return -ENOSYS; - - gnttab_interface->update_subpage_entry(ref, domid, frame, flags, - page_off, length); - - return 0; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref); - -int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, - int flags, unsigned page_off, - unsigned length) -{ - int ref, rc; - - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - - rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags, - page_off, length); - if (rc < 0) { - put_free_entry(ref); - return rc; - } - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); - -bool gnttab_subpage_grants_available(void) -{ - return gnttab_interface->update_subpage_entry != NULL; -} -EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); - -static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) -{ - gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; - gnttab_shared.v2[ref].transitive.gref = trans_gref; - gnttab_shared.v2[ref].hdr.domid = domid; - wmb(); - gnttab_shared.v2[ref].hdr.flags = - GTF_permit_access | GTF_transitive | flags; -} - -int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) -{ - if (flags & (GTF_accept_transfer | GTF_reading | - GTF_writing | GTF_sub_page)) - return -EPERM; - - if (gnttab_interface->update_trans_entry == NULL) - return -ENOSYS; - - gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid, - trans_gref); - - return 0; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref); - -int gnttab_grant_foreign_access_trans(domid_t domid, int flags, - domid_t trans_domid, - grant_ref_t trans_gref) +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref) { - int ref, rc; + u16 *pflags = &gnttab_shared.v1[ref].flags; + u16 flags; - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - - rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags, - trans_domid, trans_gref); - if (rc < 0) { - put_free_entry(ref); - return rc; - } - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); - -bool gnttab_trans_grants_available(void) -{ - return gnttab_interface->update_trans_entry != NULL; -} -EXPORT_SYMBOL_GPL(gnttab_trans_grants_available); - -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & 
(GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - -static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) -{ - u16 flags, nflags; - u16 *pflags; - - pflags = &gnttab_shared.v1[ref].flags; - nflags = *pflags; + flags = *pflags; do { - flags = nflags; if (flags & (GTF_reading|GTF_writing)) return 0; - } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); + } while (!sync_try_cmpxchg(pflags, &flags, 0)); return 1; } -static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref) { gnttab_shared.v2[ref].hdr.flags = 0; - mb(); + mb(); /* Concurrent access by hypervisor. */ if (grstatus[ref] & (GTF_reading|GTF_writing)) { return 0; } else { - /* The read of grstatus needs to have acquire - semantics. On x86, reads already have - that, and we just need to protect against - compiler reorderings. On other - architectures we may need a full - barrier. */ + /* + * The read of grstatus needs to have acquire semantics. + * On x86, reads already have that, and we just need to + * protect against compiler reorderings. + * On other architectures we may need a full barrier. + */ #ifdef CONFIG_X86 barrier(); #else @@ -457,39 +462,55 @@ static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) return 1; } -static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref) { - return gnttab_interface->end_foreign_access_ref(ref, readonly); + return gnttab_interface->end_foreign_access_ref(ref); } -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +int gnttab_end_foreign_access_ref(grant_ref_t ref) { - if (_gnttab_end_foreign_access_ref(ref, readonly)) + if (_gnttab_end_foreign_access_ref(ref)) return 1; pr_warn("WARNING: g.e. 
%#x still in use!\n", ref); return 0; } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; - bool ro; uint16_t warn_delay; struct page *page; }; static LIST_HEAD(deferred_list); -static void gnttab_handle_deferred(unsigned long); -static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); +static void gnttab_handle_deferred(struct timer_list *); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); -static void gnttab_handle_deferred(unsigned long unused) +static atomic64_t deferred_count; +static atomic64_t leaked_count; +static unsigned int free_per_iteration = 10; +module_param(free_per_iteration, uint, 0600); + +static void gnttab_handle_deferred(struct timer_list *unused) { - unsigned int nr = 10; + unsigned int nr = READ_ONCE(free_per_iteration); + const bool ignore_limit = nr == 0; struct deferred_entry *first = NULL; unsigned long flags; + size_t freed = 0; spin_lock_irqsave(&gnttab_list_lock, flags); - while (nr--) { + while ((ignore_limit || nr--) && !list_empty(&deferred_list)) { struct deferred_entry *entry = list_first_entry(&deferred_list, struct deferred_entry, list); @@ -498,14 +519,15 @@ static void gnttab_handle_deferred(unsigned long unused) break; list_del(&entry->list); spin_unlock_irqrestore(&gnttab_list_lock, flags); - if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + if (_gnttab_end_foreign_access_ref(entry->ref)) { + uint64_t ret = atomic64_dec_return(&deferred_count); + put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - __free_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n", + entry->ref, page_to_pfn(entry->page), + (unsigned long long)ret); + put_page(entry->page); + freed++; kfree(entry); entry = NULL; } else { @@ -517,27 +539,35 @@ static void gnttab_handle_deferred(unsigned long unused) spin_lock_irqsave(&gnttab_list_lock, flags); if (entry) list_add_tail(&entry->list, &deferred_list); - else if (list_empty(&deferred_list)) - break; } - if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + if (list_empty(&deferred_list)) + WARN_ON(atomic64_read(&deferred_count)); + else if (!timer_pending(&deferred_timer)) { deferred_timer.expires = jiffies + HZ; add_timer(&deferred_timer); } spin_unlock_irqrestore(&gnttab_list_lock, flags); + pr_debug("Freed %zu references", freed); } -static void gnttab_add_deferred(grant_ref_t ref, bool readonly, - struct page *page) +static void gnttab_add_deferred(grant_ref_t ref, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - const char *what = KERN_WARNING "leaking"; + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? 
GFP_ATOMIC : GFP_KERNEL; + uint64_t leaked, deferred; + + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } if (entry) { unsigned long flags; entry->ref = ref; - entry->ro = readonly; entry->page = page; entry->warn_delay = 60; spin_lock_irqsave(&gnttab_list_lock, flags); @@ -547,120 +577,38 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, add_timer(&deferred_timer); } spin_unlock_irqrestore(&gnttab_list_lock, flags); - what = KERN_DEBUG "deferring"; - } - printk("%s g.e. %#x (pfn %#lx)\n", - what, ref, page ? page_to_pfn(page) : -1); -} - -void gnttab_end_foreign_access(grant_ref_t ref, int readonly, - unsigned long page) -{ - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); - if (page != 0) - free_page(page); - } else - gnttab_add_deferred(ref, readonly, - page ? virt_to_page(page) : NULL); -} -EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); - -int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) -{ - int ref; - - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - gnttab_grant_foreign_transfer_ref(ref, domid, pfn); - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); - -void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, - unsigned long pfn) -{ - gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer); -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); - -static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) -{ - unsigned long frame; - u16 flags; - u16 *pflags; - - pflags = &gnttab_shared.v1[ref].flags; - - /* - * If a transfer is not even yet started, try to reclaim the grant - * reference and return failure (== 0). - */ - while (!((flags = *pflags) & GTF_transfer_committed)) { - if (sync_cmpxchg(pflags, flags, 0) == flags) - return 0; - cpu_relax(); - } - - /* If a transfer is in progress then wait until it is completed. */ - while (!(flags & GTF_transfer_completed)) { - flags = *pflags; - cpu_relax(); + deferred = atomic64_inc_return(&deferred_count); + leaked = atomic64_read(&leaked_count); + pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", + ref, page ? page_to_pfn(page) : -1, deferred, leaked); + } else { + deferred = atomic64_read(&deferred_count); + leaked = atomic64_inc_return(&leaked_count); + pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", + ref, page ? page_to_pfn(page) : -1, deferred, leaked); } - - rmb(); /* Read the frame number /after/ reading completion status. */ - frame = gnttab_shared.v1[ref].frame; - BUG_ON(frame == 0); - - return frame; } -static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +int gnttab_try_end_foreign_access(grant_ref_t ref) { - unsigned long frame; - u16 flags; - u16 *pflags; - - pflags = &gnttab_shared.v2[ref].hdr.flags; + int ret = _gnttab_end_foreign_access_ref(ref); - /* - * If a transfer is not even yet started, try to reclaim the grant - * reference and return failure (== 0). - */ - while (!((flags = *pflags) & GTF_transfer_committed)) { - if (sync_cmpxchg(pflags, flags, 0) == flags) - return 0; - cpu_relax(); - } - - /* If a transfer is in progress then wait until it is completed. */ - while (!(flags & GTF_transfer_completed)) { - flags = *pflags; - cpu_relax(); - } - - rmb(); /* Read the frame number /after/ reading completion status. 
*/ - frame = gnttab_shared.v2[ref].full_page.frame; - BUG_ON(frame == 0); - - return frame; -} + if (ret) + put_free_entry(ref); -unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) -{ - return gnttab_interface->end_foreign_transfer_ref(ref); + return ret; } -EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); -unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) +void gnttab_end_foreign_access(grant_ref_t ref, struct page *page) { - unsigned long frame = gnttab_end_foreign_transfer_ref(ref); - put_free_entry(ref); - return frame; + if (gnttab_try_end_foreign_access(ref)) { + if (page) + put_page(page); + } else + gnttab_add_deferred(ref, page); } -EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); void gnttab_free_grant_reference(grant_ref_t ref) { @@ -672,23 +620,31 @@ void gnttab_free_grant_references(grant_ref_t head) { grant_ref_t ref; unsigned long flags; - int count = 1; - if (head == GNTTAB_LIST_END) - return; + spin_lock_irqsave(&gnttab_list_lock, flags); - ref = head; - while (gnttab_entry(ref) != GNTTAB_LIST_END) { - ref = gnttab_entry(ref); - count++; + while (head != GNTTAB_LIST_END) { + ref = gnttab_entry(head); + put_free_entry_locked(head); + head = ref; } - gnttab_entry(ref) = gnttab_free_head; - gnttab_free_head = head; - gnttab_free_count += count; check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } EXPORT_SYMBOL_GPL(gnttab_free_grant_references); +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count) +{ + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (i = count; i > 0; i--) + put_free_entry_locked(head + i - 1); + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference_seq); + int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) { int h = get_free_entries(count); @@ -702,6 +658,24 @@ int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) } EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first) +{ + int h; + + if (count == 1) + h = get_free_entries(1); + else + h = get_free_entries_seq(count); + + if (h < 0) + return -ENOSPC; + + *first = h; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_reference_seq); + int gnttab_empty_grant_references(const grant_ref_t *private_head) { return (*private_head == GNTTAB_LIST_END); @@ -730,9 +704,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; @@ -760,42 +743,45 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) } EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); +static unsigned int gnttab_frames(unsigned int frames, unsigned int align) +{ + return (frames * gnttab_interface->grefs_per_grant_frame + align - 1) / + align; +} + static int grow_gnttab_list(unsigned int more_frames) { unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + unsigned int grefs_per_frame; - 
BUG_ON(grefs_per_grant_frame == 0); + grefs_per_frame = gnttab_interface->grefs_per_grant_frame; new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * grefs_per_grant_frame; + extra_entries = more_frames * grefs_per_frame; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; - new_nr_glist_frames = - (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); + new_nr_glist_frames = gnttab_frames(new_nr_grant_frames, RPP); for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) goto grow_nomem; } + gnttab_set_free(gnttab_size, extra_entries); - for (i = grefs_per_grant_frame * nr_grant_frames; - i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) - gnttab_entry(i) = i + 1; - - gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; - gnttab_free_count += extra_entries; + if (!gnttab_free_tail_ptr) + gnttab_free_tail_ptr = __gnttab_entry(gnttab_size); nr_grant_frames = new_nr_grant_frames; + gnttab_size += extra_entries; check_free_callbacks(); return 0; grow_nomem: - for ( ; i >= nr_glist_frames; i--) + while (i-- > nr_glist_frames) free_page((unsigned long) gnttab_list[i]); return -ENOMEM; } @@ -817,6 +803,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -824,6 +815,339 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = memremap(addr, XEN_PAGE_SIZE * max_nr_gframes, MEMREMAP_WB); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + memunmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = XEN_PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + memunmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + +int gnttab_pages_set_private(int nr_pages, struct page **pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) { +#if BITS_PER_LONG < 64 + struct xen_page_foreign *foreign; + + foreign = kzalloc(sizeof(*foreign), GFP_KERNEL); + if (!foreign) + return -ENOMEM; + + set_page_private(pages[i], (unsigned long)foreign); +#endif + SetPagePrivate(pages[i]); + } + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_pages_set_private); + +/** + * gnttab_alloc_pages - alloc pages suitable for grant 
mapping into + * @nr_pages: number of pages to alloc + * @pages: returns the pages + */ +int gnttab_alloc_pages(int nr_pages, struct page **pages) +{ + int ret; + + ret = xen_alloc_unpopulated_pages(nr_pages, pages); + if (ret < 0) + return ret; + + ret = gnttab_pages_set_private(nr_pages, pages); + if (ret < 0) + gnttab_free_pages(nr_pages, pages); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_pages); + +#ifdef CONFIG_XEN_UNPOPULATED_ALLOC +static inline void cache_init(struct gnttab_page_cache *cache) +{ + cache->pages = NULL; +} + +static inline bool cache_empty(struct gnttab_page_cache *cache) +{ + return !cache->pages; +} + +static inline struct page *cache_deq(struct gnttab_page_cache *cache) +{ + struct page *page; + + page = cache->pages; + cache->pages = page->zone_device_data; + + return page; +} + +static inline void cache_enq(struct gnttab_page_cache *cache, struct page *page) +{ + page->zone_device_data = cache->pages; + cache->pages = page; +} +#else +static inline void cache_init(struct gnttab_page_cache *cache) +{ + INIT_LIST_HEAD(&cache->pages); +} + +static inline bool cache_empty(struct gnttab_page_cache *cache) +{ + return list_empty(&cache->pages); +} + +static inline struct page *cache_deq(struct gnttab_page_cache *cache) +{ + struct page *page; + + page = list_first_entry(&cache->pages, struct page, lru); + list_del(&page->lru); + + return page; +} + +static inline void cache_enq(struct gnttab_page_cache *cache, struct page *page) +{ + list_add(&page->lru, &cache->pages); +} +#endif + +void gnttab_page_cache_init(struct gnttab_page_cache *cache) +{ + spin_lock_init(&cache->lock); + cache_init(cache); + cache->num_pages = 0; +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_init); + +int gnttab_page_cache_get(struct gnttab_page_cache *cache, struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + + if (cache_empty(cache)) { + spin_unlock_irqrestore(&cache->lock, flags); + return gnttab_alloc_pages(1, page); + } + + page[0] = cache_deq(cache); + cache->num_pages--; + + spin_unlock_irqrestore(&cache->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_get); + +void gnttab_page_cache_put(struct gnttab_page_cache *cache, struct page **page, + unsigned int num) +{ + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&cache->lock, flags); + + for (i = 0; i < num; i++) + cache_enq(cache, page[i]); + cache->num_pages += num; + + spin_unlock_irqrestore(&cache->lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_put); + +void gnttab_page_cache_shrink(struct gnttab_page_cache *cache, unsigned int num) +{ + struct page *page[10]; + unsigned int i = 0; + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + + while (cache->num_pages > num) { + page[i] = cache_deq(cache); + cache->num_pages--; + if (++i == ARRAY_SIZE(page)) { + spin_unlock_irqrestore(&cache->lock, flags); + gnttab_free_pages(i, page); + i = 0; + spin_lock_irqsave(&cache->lock, flags); + } + } + + spin_unlock_irqrestore(&cache->lock, flags); + + if (i != 0) + gnttab_free_pages(i, page); +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_shrink); + +void gnttab_pages_clear_private(int nr_pages, struct page **pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) { + if (PagePrivate(pages[i])) { +#if BITS_PER_LONG < 64 + kfree((void *)page_private(pages[i])); +#endif + ClearPagePrivate(pages[i]); + } + } +} +EXPORT_SYMBOL_GPL(gnttab_pages_clear_private); + +/** + * gnttab_free_pages - free pages allocated by gnttab_alloc_pages() + * @nr_pages: number of pages 
to free + * @pages: the pages + */ +void gnttab_free_pages(int nr_pages, struct page **pages) +{ + gnttab_pages_clear_private(nr_pages, pages); + xen_free_unpopulated_pages(nr_pages, pages); +} +EXPORT_SYMBOL_GPL(gnttab_free_pages); + +#ifdef CONFIG_XEN_GRANT_DMA_ALLOC +/** + * gnttab_dma_alloc_pages - alloc DMAable pages suitable for grant mapping into + * @args: arguments to the function + */ +int gnttab_dma_alloc_pages(struct gnttab_dma_alloc_args *args) +{ + unsigned long pfn, start_pfn; + size_t size; + int i, ret; + + if (args->nr_pages < 0 || args->nr_pages > (INT_MAX >> PAGE_SHIFT)) + return -ENOMEM; + + size = args->nr_pages << PAGE_SHIFT; + if (args->coherent) + args->vaddr = dma_alloc_coherent(args->dev, size, + &args->dev_bus_addr, + GFP_KERNEL | __GFP_NOWARN); + else + args->vaddr = dma_alloc_wc(args->dev, size, + &args->dev_bus_addr, + GFP_KERNEL | __GFP_NOWARN); + if (!args->vaddr) { + pr_debug("Failed to allocate DMA buffer of size %zu\n", size); + return -ENOMEM; + } + + start_pfn = __phys_to_pfn(args->dev_bus_addr); + for (pfn = start_pfn, i = 0; pfn < start_pfn + args->nr_pages; + pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + args->pages[i] = page; + args->frames[i] = xen_page_to_gfn(page); + xenmem_reservation_scrub_page(page); + } + + xenmem_reservation_va_mapping_reset(args->nr_pages, args->pages); + + ret = xenmem_reservation_decrease(args->nr_pages, args->frames); + if (ret != args->nr_pages) { + pr_debug("Failed to decrease reservation for DMA buffer\n"); + ret = -EFAULT; + goto fail; + } + + ret = gnttab_pages_set_private(args->nr_pages, args->pages); + if (ret < 0) + goto fail; + + return 0; + +fail: + gnttab_dma_free_pages(args); + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_dma_alloc_pages); + +/** + * gnttab_dma_free_pages - free DMAable pages + * @args: arguments to the function + */ +int gnttab_dma_free_pages(struct gnttab_dma_alloc_args *args) +{ + size_t size; + int i, ret; + + gnttab_pages_clear_private(args->nr_pages, args->pages); + + for (i = 0; i < args->nr_pages; i++) + args->frames[i] = page_to_xen_pfn(args->pages[i]); + + ret = xenmem_reservation_increase(args->nr_pages, args->frames); + if (ret != args->nr_pages) { + pr_debug("Failed to increase reservation for DMA buffer\n"); + ret = -EFAULT; + } else { + ret = 0; + } + + xenmem_reservation_va_mapping_update(args->nr_pages, args->pages, + args->frames); + + size = args->nr_pages << PAGE_SHIFT; + if (args->coherent) + dma_free_coherent(args->dev, size, + args->vaddr, args->dev_bus_addr); + else + dma_free_wc(args->dev, size, + args->vaddr, args->dev_bus_addr); + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_dma_free_pages); +#endif + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -870,95 +1194,186 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset; + unsigned int glen; + unsigned long xen_pfn; + + len = min(PAGE_SIZE - offset, len); + goffset = xen_offset_in_page(offset); + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset); + + while (len) { + glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len); + fn(pfn_to_gfn(xen_pfn), goffset, glen, data); + + goffset = 0; + xen_pfn++; + len -= glen; + } +} +EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range); + +void gnttab_foreach_grant(struct page **pages, + unsigned int 
nr_grefs, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset = 0; + unsigned long xen_pfn = 0; + unsigned int i; + + for (i = 0; i < nr_grefs; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + xen_pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); + goffset = 0; + } + + fn(pfn_to_gfn(xen_pfn), goffset, XEN_PAGE_SIZE, data); + + goffset += XEN_PAGE_SIZE; + xen_pfn++; + } +} + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { int i, ret; - bool lazy = false; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) return ret; - /* Retry eagain maps */ - for (i = 0; i < count; i++) - if (map_ops[i].status == GNTST_eagain) - gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, - &map_ops[i].status, __func__); + for (i = 0; i < count; i++) { + switch (map_ops[i].status) { + case GNTST_okay: + { + struct xen_page_foreign *foreign; + + SetPageForeign(pages[i]); + foreign = xen_page_foreign(pages[i]); + foreign->domid = map_ops[i].dom; + foreign->gref = map_ops[i].ref; + break; + } - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; + case GNTST_no_device_space: + pr_warn_ratelimited("maptrack limit reached, can't map all guest pages\n"); + break; - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } + case GNTST_eagain: + /* Retry eagain maps */ + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, + map_ops + i, + &map_ops[i].status, __func__); + /* Test status in next loop iteration. */ + i--; + break; - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + default: + break; } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; } - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { - int i, ret; - bool lazy = false; + unsigned int i; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; + for (i = 0; i < count; i++) + ClearPageForeign(pages[i]); - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } + return clear_foreign_p2m_mapping(unmap_ops, kunmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], kmap_ops ? 
- &kmap_ops[i] : NULL); - if (ret) - return ret; +#define GNTTAB_UNMAP_REFS_DELAY 5 + +static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item); + +static void gnttab_unmap_work(struct work_struct *work) +{ + struct gntab_unmap_queue_data + *unmap_data = container_of(work, + struct gntab_unmap_queue_data, + gnttab_work.work); + if (unmap_data->age != UINT_MAX) + unmap_data->age++; + __gnttab_unmap_refs_async(unmap_data); +} + +static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item) +{ + int ret; + int pc; + + for (pc = 0; pc < item->count; pc++) { + if (page_count(item->pages[pc]) > 1) { + unsigned long delay = GNTTAB_UNMAP_REFS_DELAY * (item->age + 1); + schedule_delayed_work(&item->gnttab_work, + msecs_to_jiffies(delay)); + return; + } } - if (lazy) - arch_leave_lazy_mmu_mode(); + ret = gnttab_unmap_refs(item->unmap_ops, item->kunmap_ops, + item->pages, item->count); + item->done(ret, item); +} - return ret; +void gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item) +{ + INIT_DELAYED_WORK(&item->gnttab_work, gnttab_unmap_work); + item->age = 0; + + __gnttab_unmap_refs_async(item); } -EXPORT_SYMBOL_GPL(gnttab_unmap_refs); +EXPORT_SYMBOL_GPL(gnttab_unmap_refs_async); -static unsigned nr_status_frames(unsigned nr_grant_frames) +static void unmap_refs_callback(int result, + struct gntab_unmap_queue_data *data) { - BUG_ON(grefs_per_grant_frame == 0); - return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; + struct unmap_refs_callback_data *d = data->data; + + d->result = result; + complete(&d->completion); +} + +int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item) +{ + struct unmap_refs_callback_data data; + + init_completion(&data.completion); + item->data = &data; + item->done = &unmap_refs_callback; + gnttab_unmap_refs_async(item); + wait_for_completion(&data.completion); + + return data.result; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); + +static unsigned int nr_status_frames(unsigned int nr_grant_frames) +{ + return gnttab_frames(nr_grant_frames, SPP); } static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) @@ -990,7 +1405,7 @@ static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) /* No need for kzalloc as it is initialized in following hypercall * GNTTABOP_get_status_frames. */ - sframes = kmalloc(nr_sframes * sizeof(uint64_t), GFP_ATOMIC); + sframes = kmalloc_array(nr_sframes, sizeof(uint64_t), GFP_ATOMIC); if (!sframes) return -ENOMEM; @@ -1034,10 +1449,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (!xen_pv_domain()) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -1046,7 +1462,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1061,7 +1477,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) /* No need for kzalloc as it is initialized in following hypercall * GNTTABOP_setup_table. 
*/ - frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); + frames = kmalloc_array(nr_gframes, sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -1084,55 +1500,66 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return rc; } -static struct gnttab_ops gnttab_v1_ops = { +static const struct gnttab_ops gnttab_v1_ops = { + .version = 1, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(struct grant_entry_v1), .map_frames = gnttab_map_frames_v1, .unmap_frames = gnttab_unmap_frames_v1, .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, - .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, + .read_frame = gnttab_read_frame_v1, }; -static struct gnttab_ops gnttab_v2_ops = { +static const struct gnttab_ops gnttab_v2_ops = { + .version = 2, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(union grant_entry_v2), .map_frames = gnttab_map_frames_v2, .unmap_frames = gnttab_unmap_frames_v2, .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, - .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, - .update_subpage_entry = gnttab_update_subpage_entry_v2, - .update_trans_entry = gnttab_update_trans_entry_v2, + .read_frame = gnttab_read_frame_v2, }; +static bool gnttab_need_v2(void) +{ +#ifdef CONFIG_X86 + uint32_t base, width; + + if (xen_pv_domain()) { + base = xen_cpuid_base(); + if (cpuid_eax(base) < 5) + return false; /* Information not available, use V1. */ + width = cpuid_ebx(base + 5) & + XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK; + return width > 32 + PAGE_SHIFT; + } +#endif + return !!(max_possible_pfn >> 32); +} + static void gnttab_request_version(void) { - int rc; + long rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else + if (gnttab_need_v2()) gsv.version = 2; + else + gsv.version = 1; + + /* Boot parameter overrides automatic selection. */ + if (xen_gnttab_version >= 1 && xen_gnttab_version <= 2) + gsv.version = xen_gnttab_version; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); - if (rc == 0 && gsv.version == 2) { - grant_table_version = 2; - grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); + if (rc == 0 && gsv.version == 2) gnttab_interface = &gnttab_v2_ops; - } else if (grant_table_version == 2) { - /* - * If we've already used version 2 features, - * but then suddenly discover that they're not - * available (e.g. migrating to an older - * version of Xen), almost unbounded badness - * can happen. 
- */ - panic("we need grant tables version 2, but only version 1 is available"); - } else { - grant_table_version = 1; - grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); + else gnttab_interface = &gnttab_v1_ops; - } - pr_info("Grant tables using version %d layout\n", grant_table_version); + pr_info("Grant tables using version %d layout\n", + gnttab_interface->version); } static int gnttab_setup(void) @@ -1143,21 +1570,14 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (!xen_pv_domain() && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("Failed to ioremap gnttab share frames!\n"); + pr_warn("gnttab share frames is not mapped!\n"); return -ENOMEM; } } - - gnttab_map(0, nr_grant_frames - 1); - - return 0; + return gnttab_map(0, nr_grant_frames - 1); } int gnttab_resume(void) @@ -1168,7 +1588,8 @@ int gnttab_resume(void) int gnttab_suspend(void) { - gnttab_interface->unmap_frames(); + if (xen_pv_domain()) + gnttab_interface->unmap_frames(); return 0; } @@ -1177,12 +1598,17 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; - BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (grefs_per_grant_frame-1)) / - grefs_per_grant_frame); - if (cur + extra > gnttab_max_grant_frames()) + extra = ((req_entries + gnttab_interface->grefs_per_grant_frame - 1) / + gnttab_interface->grefs_per_grant_frame); + if (cur + extra > gnttab_max_grant_frames()) { + pr_warn_ratelimited("xen/grant-table: max_grant_frames reached" + " cur=%u extra=%u limit=%u" + " gnttab_free_count=%u req_entries=%u\n", + cur, extra, gnttab_max_grant_frames(), + gnttab_free_count, req_entries); return -ENOSPC; + } rc = gnttab_map(cur, cur + extra - 1); if (rc == 0) @@ -1194,27 +1620,28 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames, max_nr_grefs; unsigned int max_nr_glist_frames, nr_glist_frames; - unsigned int nr_init_grefs; int ret; gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); + max_nr_grefs = max_nr_grant_frames * + gnttab_interface->grefs_per_grant_frame; nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. 
*/ - BUG_ON(grefs_per_grant_frame == 0); - max_nr_glist_frames = (boot_max_nr_grant_frames * - grefs_per_grant_frame / RPP); + max_nr_glist_frames = max_nr_grefs / RPP; - gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), - GFP_KERNEL); + gnttab_list = kmalloc_array(max_nr_glist_frames, + sizeof(grant_ref_t *), + GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1223,19 +1650,26 @@ int gnttab_init(void) } } + gnttab_free_bitmap = bitmap_zalloc(max_nr_grefs, GFP_KERNEL); + if (!gnttab_free_bitmap) { + ret = -ENOMEM; + goto ini_nomem; + } + + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + if (gnttab_setup() < 0) { ret = -ENODEV; goto ini_nomem; } - nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; + gnttab_size = nr_grant_frames * gnttab_interface->grefs_per_grant_frame; - for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) - gnttab_entry(i) = i + 1; - - gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; - gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; - gnttab_free_head = NR_RESERVED_ENTRIES; + gnttab_set_free(GNTTAB_NR_RESERVED_ENTRIES, + gnttab_size - GNTTAB_NR_RESERVED_ENTRIES); printk("Grant table initialized\n"); return 0; @@ -1244,20 +1678,22 @@ int gnttab_init(void) for (i--; i >= 0; i--) free_page((unsigned long)gnttab_list[i]); kfree(gnttab_list); + bitmap_free(gnttab_free_bitmap); return ret; } EXPORT_SYMBOL_GPL(gnttab_init); static int __gnttab_init(void) { + if (!xen_domain()) + return -ENODEV; + /* Delay grant-table initialization in the PV on HVM case */ - if (xen_hvm_domain()) + if (xen_hvm_domain() && !xen_pvh_domain()) return 0; - if (!xen_pv_domain()) - return -ENODEV; - return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. 
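As an aside to the gnttab_need_v2() check above: the v2 layout is selected when machine frame numbers can exceed what grant_entry_v1's 32-bit frame field can hold. A minimal standalone sketch of the same width test, assuming 4 KiB pages (PAGE_SHIFT = 12); the helper name is made up for illustration:

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assumed 4 KiB pages */

/* Mirrors the test in gnttab_need_v2(): a machine address width of W bits
 * means frame numbers need W - PAGE_SHIFT bits; grant_entry_v1.frame is
 * only 32 bits wide, so W > 32 + PAGE_SHIFT (44 here) forces the v2 ops. */
static bool example_need_gnttab_v2(uint32_t machine_addr_width)
{
	return machine_addr_width > 32 + EXAMPLE_PAGE_SHIFT;
}

For example, a host reporting a 46-bit machine address width needs 34-bit frame numbers, so only the v2 layout (gnttab_v2_ops above) can name all of its frames.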
*/ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 624e8dc24532..e20c40a62e64 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Handle extern requests for shutdown, reboot and sysrq */ @@ -10,6 +11,7 @@ #include <linux/reboot.h> #include <linux/sysrq.h> #include <linux/stop_machine.h> +#include <linux/suspend.h> #include <linux/freezer.h> #include <linux/syscore_ops.h> #include <linux/export.h> @@ -19,10 +21,10 @@ #include <xen/grant_table.h> #include <xen/events.h> #include <xen/hvc-console.h> +#include <xen/page.h> #include <xen/xen-ops.h> #include <asm/xen/hypercall.h> -#include <asm/xen/page.h> #include <asm/xen/hypervisor.h> enum shutdown_state { @@ -41,32 +43,17 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -#ifdef CONFIG_HIBERNATE_CALLBACKS -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} - -static void xen_pre_suspend(void) -{ - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); +#ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) { struct suspend_info *si = data; @@ -80,22 +67,20 @@ static int xen_suspend(void *data) return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_manage_runstate_time(-1); + xen_arch_pre_suspend(); - /* - * This hypercall returns 1 if suspend was cancelled - * or the domain was merely checkpointed, and 0 if it - * is resuming in a new domain. - */ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_gfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + xen_manage_runstate_time(si->cancelled ? 1 : 0); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } @@ -111,21 +96,28 @@ static void do_suspend(void) shutting_down = SHUTDOWN_SUSPEND; -#ifdef CONFIG_PREEMPT - /* If the kernel is preemptible, we need to freeze all the processes - to prevent them from being in the middle of a pagetable update - during suspend. 
*/ + if (!mutex_trylock(&system_transition_mutex)) + { + pr_err("%s: failed to take system_transition_mutex\n", __func__); + goto out; + } + err = freeze_processes(); if (err) { - pr_err("%s: freeze failed %d\n", __func__, err); - goto out; + pr_err("%s: freeze processes failed %d\n", __func__, err); + goto out_unlock; + } + + err = freeze_kernel_threads(); + if (err) { + pr_err("%s: freeze kernel threads failed %d\n", __func__, err); + goto out_thaw; } -#endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { pr_err("%s: dpm_suspend_start %d\n", __func__, err); - goto out_thaw; + goto out_resume_end; } printk(KERN_DEBUG "suspending xenstore...\n"); @@ -138,20 +130,20 @@ static void do_suspend(void) goto out_resume; } - si.cancelled = 1; + xen_arch_suspend(); - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } + si.cancelled = 1; err = stop_machine(xen_suspend, &si, cpumask_of(0)); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + + xen_arch_resume(); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { @@ -160,56 +152,82 @@ static void do_suspend(void) } out_resume: - if (!si.cancelled) { - xen_arch_resume(); + if (!si.cancelled) xs_resume(); - } else + else xs_suspend_cancel(); +out_resume_end: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); out_thaw: -#ifdef CONFIG_PREEMPT thaw_processes(); +out_unlock: + mutex_unlock(&system_transition_mutex); out: -#endif shutting_down = SHUTDOWN_INVALID; } #endif /* CONFIG_HIBERNATE_CALLBACKS */ struct shutdown_handler { - const char *command; +#define SHUTDOWN_CMD_SIZE 11 + const char command[SHUTDOWN_CMD_SIZE]; + bool flag; void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + break; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + case SYSTEM_SCHEDULING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. */ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) { shutting_down = SHUTDOWN_POWEROFF; /* ? 
*/ - ctrl_alt_del(); + orderly_reboot(); } +static const struct shutdown_handler shutdown_handlers[] = { + { "poweroff", true, do_poweroff }, + { "halt", false, do_poweroff }, + { "reboot", true, do_reboot }, +#ifdef CONFIG_HIBERNATE_CALLBACKS + { "suspend", true, do_suspend }, +#endif +}; + static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { char *str; struct xenbus_transaction xbt; int err; - static struct shutdown_handler handlers[] = { - { "poweroff", do_poweroff }, - { "halt", do_poweroff }, - { "reboot", do_reboot }, -#ifdef CONFIG_HIBERNATE_CALLBACKS - { "suspend", do_suspend }, -#endif - {NULL, NULL}, - }; - static struct shutdown_handler *handler; + int idx; if (shutting_down != SHUTDOWN_INVALID) return; @@ -226,13 +244,13 @@ static void shutdown_handler(struct xenbus_watch *watch, return; } - for (handler = &handlers[0]; handler->command; handler++) { - if (strcmp(str, handler->command) == 0) + for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) { + if (strcmp(str, shutdown_handlers[idx].command) == 0) break; } /* Only acknowledge commands which we are prepared to handle. */ - if (handler->cb) + if (idx < ARRAY_SIZE(shutdown_handlers)) xenbus_write(xbt, "control", "shutdown", ""); err = xenbus_transaction_end(xbt, 0); @@ -241,8 +259,8 @@ static void shutdown_handler(struct xenbus_watch *watch, goto again; } - if (handler->cb) { - handler->cb(); + if (idx < ARRAY_SIZE(shutdown_handlers)) { + shutdown_handlers[idx].cb(); } else { pr_info("Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; @@ -252,8 +270,8 @@ static void shutdown_handler(struct xenbus_watch *watch, } #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) +static void sysrq_handler(struct xenbus_watch *watch, const char *path, + const char *token) { char sysrq_key = '\0'; struct xenbus_transaction xbt; @@ -263,14 +281,31 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, err = xenbus_transaction_start(&xbt); if (err) return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - pr_err("Unable to read sysrq code in control/sysrq\n"); + err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key); + if (err < 0) { + /* + * The Xenstore watch fires directly after registering it and + * after a suspend/resume cycle. So ENOENT is no error but + * might happen in those cases. ERANGE is observed when we get + * an empty value (''), this happens when we acknowledge the + * request by writing '\0' below. 
+ */ + if (err != -ENOENT && err != -ERANGE) + pr_err("Error %d reading sysrq code in control/sysrq\n", + err); xenbus_transaction_end(xbt, 1); return; } - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + if (sysrq_key != '\0') { + err = xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + if (err) { + pr_err("%s: Error %d writing sysrq in control/sysrq\n", + __func__, err); + xenbus_transaction_end(xbt, 1); + return; + } + } err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) @@ -291,9 +326,16 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; + int idx; +#define FEATURE_PATH_SIZE (SHUTDOWN_CMD_SIZE + sizeof("feature-")) + char node[FEATURE_PATH_SIZE]; err = register_xenbus_watch(&shutdown_watch); if (err) { @@ -301,6 +343,7 @@ static int setup_shutdown_watcher(void) return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { @@ -309,6 +352,19 @@ static int setup_shutdown_watcher(void) } #endif + for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) { + if (!shutdown_handlers[idx].flag) + continue; + snprintf(node, FEATURE_PATH_SIZE, "feature-%s", + shutdown_handlers[idx].command); + err = xenbus_printf(XBT_NIL, "control", node, "%u", 1); + if (err) { + pr_err("%s: Error %d writing %s\n", __func__, + err, node); + return err; + } + } + return 0; } @@ -329,6 +385,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index 6ab6a79c38a5..4f65b641c054 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -139,12 +139,12 @@ out: return err ? 
err : buf - ubuf; } -static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +static __poll_t xen_mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &xen_mce_chrdev_wait, wait); if (xen_mcelog.next) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } @@ -182,7 +182,6 @@ static const struct file_operations xen_mce_chrdev_ops = { .read = xen_mce_chrdev_read, .poll = xen_mce_chrdev_poll, .unlocked_ioctl = xen_mce_chrdev_ioctl, - .llseek = no_llseek, }; static struct miscdevice xen_mce_chrdev_device = { @@ -222,7 +221,7 @@ static int convert_log(struct mc_info *mi) struct mcinfo_global *mc_global; struct mcinfo_bank *mc_bank; struct xen_mce m; - uint32_t i; + unsigned int i, j; mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); @@ -248,7 +247,17 @@ static int convert_log(struct mc_info *mi) m.socketid = g_physinfo[i].mc_chipid; m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; - m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + for (j = 0; j < g_physinfo[i].mc_nmsrvals; ++j) + switch (g_physinfo[i].mc_msrvalues[j].reg) { + case MSR_IA32_MCG_CAP: + m.mcgcap = g_physinfo[i].mc_msrvalues[j].value; + break; + + case MSR_PPIN: + case MSR_AMD_PPIN: + m.ppin = g_physinfo[i].mc_msrvalues[j].value; + break; + } mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); @@ -288,7 +297,6 @@ static int mc_queue_handle(uint32_t flags) int ret = 0; mc_op.cmd = XEN_MC_fetch; - mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); do { mc_op.u.mc_fetch.flags = flags; @@ -358,7 +366,6 @@ static int bind_virq_for_mce(void) /* Fetch physical CPU Numbers */ mc_op.cmd = XEN_MC_physcpuinfo; - mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ret = HYPERVISOR_mca(&mc_op); if (ret) { @@ -393,14 +400,27 @@ static int bind_virq_for_mce(void) static int __init xen_late_init_mcelog(void) { + int ret; + /* Only DOM0 is responsible for MCE logging */ - if (xen_initial_domain()) { - /* register character device /dev/mcelog for xen mcelog */ - if (misc_register(&xen_mce_chrdev_device)) - return -ENODEV; - return bind_virq_for_mce(); - } + if (!xen_initial_domain()) + return -ENODEV; - return -ENODEV; + /* register character device /dev/mcelog for xen mcelog */ + ret = misc_register(&xen_mce_chrdev_device); + if (ret) + return ret; + + ret = bind_virq_for_mce(); + if (ret) + goto deregister; + + pr_info("/dev/mcelog registered by Xen\n"); + + return 0; + +deregister: + misc_deregister(&xen_mce_chrdev_device); + return ret; } device_initcall(xen_late_init_mcelog); diff --git a/drivers/xen/mem-reservation.c b/drivers/xen/mem-reservation.c new file mode 100644 index 000000000000..24648836e0d4 --- /dev/null +++ b/drivers/xen/mem-reservation.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +/****************************************************************************** + * Xen memory reservation utilities. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc. 
+ */ + +#include <asm/xen/hypercall.h> + +#include <xen/interface/memory.h> +#include <xen/mem-reservation.h> +#include <linux/moduleparam.h> + +bool __read_mostly xen_scrub_pages = IS_ENABLED(CONFIG_XEN_SCRUB_PAGES_DEFAULT); +core_param(xen_scrub_pages, xen_scrub_pages, bool, 0); + +/* + * Use one extent per PAGE_SIZE to avoid to break down the page into + * multiple frame. + */ +#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) + +#ifdef CONFIG_XEN_HAVE_PVMMU +void __xenmem_reservation_va_mapping_update(unsigned long count, + struct page **pages, + xen_pfn_t *frames) +{ + int i; + + for (i = 0; i < count; i++) { + struct page *page = pages[i]; + unsigned long pfn = page_to_pfn(page); + int ret; + + BUG_ON(!page); + + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + + set_phys_to_machine(pfn, frames[i]); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frames[i], PAGE_KERNEL), 0); + BUG_ON(ret); + } +} +EXPORT_SYMBOL_GPL(__xenmem_reservation_va_mapping_update); + +void __xenmem_reservation_va_mapping_reset(unsigned long count, + struct page **pages) +{ + int i; + + for (i = 0; i < count; i++) { + struct page *page = pages[i]; + unsigned long pfn = page_to_pfn(page); + int ret; + + /* + * We don't support PV MMU when Linux and Xen are using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + __pte_ma(0), 0); + BUG_ON(ret); + + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +} +EXPORT_SYMBOL_GPL(__xenmem_reservation_va_mapping_reset); +#endif /* CONFIG_XEN_HAVE_PVMMU */ + +/* @frames is an array of PFNs */ +int xenmem_reservation_increase(int count, xen_pfn_t *frames) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = EXTENT_ORDER, + .domid = DOMID_SELF + }; + + /* XENMEM_populate_physmap requires a PFN based on Xen granularity. */ + set_xen_guest_handle(reservation.extent_start, frames); + reservation.nr_extents = count; + return HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); +} +EXPORT_SYMBOL_GPL(xenmem_reservation_increase); + +/* @frames is an array of GFNs */ +int xenmem_reservation_decrease(int count, xen_pfn_t *frames) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = EXTENT_ORDER, + .domid = DOMID_SELF + }; + + /* XENMEM_decrease_reservation requires a GFN */ + set_xen_guest_handle(reservation.extent_start, frames); + reservation.nr_extents = count; + return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); +} +EXPORT_SYMBOL_GPL(xenmem_reservation_decrease); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254eb..bfe07adb3e3a 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -1,24 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009, Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
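For context on the new mem-reservation helpers above, here is a hedged sketch of how a caller (the balloon driver is the obvious user) might hand one page's backing frame to the hypervisor and later repopulate it. Illustrative only: the PV-MMU p2m/VA fixups done by the __xenmem_reservation_va_mapping_* helpers are omitted, and the example assumes an auto-translated guest where gfn == pfn.

#include <linux/errno.h>
#include <linux/mm.h>
#include <xen/mem-reservation.h>

static int example_cycle_one_page(struct page *page)
{
	xen_pfn_t frame = page_to_pfn(page);	/* gfn == pfn when auto-translated */
	int ret;

	/* Give the backing frame back to Xen (XENMEM_decrease_reservation). */
	ret = xenmem_reservation_decrease(1, &frame);
	if (ret != 1)
		return -ENOMEM;

	/* ... later: ask Xen to populate the same guest frame again. */
	ret = xenmem_reservation_increase(1, &frame);
	return ret == 1 ? 0 : -ENOMEM;
}

Both helpers return the hypercall result, i.e. the number of extents the hypervisor actually processed, so anything other than the requested count is treated as failure here.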
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * Author: Weidong Han <weidong.han@intel.com> */ #include <linux/pci.h> #include <linux/acpi.h> +#include <linux/pci-acpi.h> +#include <xen/pci.h> #include <xen/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/xen.h> @@ -26,6 +16,11 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> + +static int xen_mcfg_late(void); +#endif static bool __read_mostly pci_seg_supported = true; @@ -36,35 +31,69 @@ static int xen_add_device(struct device *dev) #ifdef CONFIG_PCI_IOV struct pci_dev *physfn = pci_dev->physfn; #endif +#ifdef CONFIG_PCI_MMCONFIG + static bool pci_mcfg_reserved = false; + /* + * Reserve MCFG areas in Xen on first invocation due to this being + * potentially called from inside of acpi_init immediately after + * MCFG table has been finally parsed. + */ + if (!pci_mcfg_reserved) { + xen_mcfg_late(); + pci_mcfg_reserved = true; + } +#endif + + if (pci_domain_nr(pci_dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values, do not attempt to register devices with Xen in + * segments greater or equal than 0x10000. + */ + dev_info(dev, + "not registering with Xen: invalid PCI segment\n"); + return 0; + } if (pci_seg_supported) { - struct physdev_pci_device_add add = { - .seg = pci_domain_nr(pci_dev->bus), - .bus = pci_dev->bus->number, - .devfn = pci_dev->devfn - }; + DEFINE_RAW_FLEX(struct physdev_pci_device_add, add, optarr, 1); + + add->seg = pci_domain_nr(pci_dev->bus); + add->bus = pci_dev->bus->number; + add->devfn = pci_dev->devfn; + #ifdef CONFIG_ACPI acpi_handle handle; #endif #ifdef CONFIG_PCI_IOV if (pci_dev->is_virtfn) { - add.flags = XEN_PCI_DEV_VIRTFN; - add.physfn.bus = physfn->bus->number; - add.physfn.devfn = physfn->devfn; + add->flags = XEN_PCI_DEV_VIRTFN; + add->physfn.bus = physfn->bus->number; + add->physfn.devfn = physfn->devfn; } else #endif if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) - add.flags = XEN_PCI_DEV_EXTFN; + add->flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); - if (!handle && pci_dev->bus->bridge) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(&pci_dev->dev); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif + if (!handle) { + /* + * This device was not listed in the ACPI name space at + * all. Try to get acpi handle of parent pci bus. 
+ */ + struct pci_bus *pbus; + for (pbus = pci_dev->bus; pbus; pbus = pbus->parent) { + handle = acpi_pci_get_bridge_handle(pbus); + if (handle) + break; + } + } if (handle) { acpi_status status; @@ -74,8 +103,8 @@ static int xen_add_device(struct device *dev) status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); if (ACPI_SUCCESS(status)) { - add.optarr[0] = pxm; - add.flags |= XEN_PCI_DEV_PXM; + add->optarr[0] = pxm; + add->flags |= XEN_PCI_DEV_PXM; break; } status = acpi_get_parent(handle, &handle); @@ -83,7 +112,7 @@ static int xen_add_device(struct device *dev) } #endif /* CONFIG_ACPI */ - r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add); if (r != -ENOSYS) return r; pci_seg_supported = false; @@ -132,6 +161,16 @@ static int xen_remove_device(struct device *dev) int r; struct pci_dev *pci_dev = to_pci_dev(dev); + if (pci_domain_nr(pci_dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values. + */ + dev_info(dev, + "not unregistering with Xen: invalid PCI segment\n"); + return 0; + } + if (pci_seg_supported) { struct physdev_pci_device device = { .seg = pci_domain_nr(pci_dev->bus), @@ -156,6 +195,29 @@ static int xen_remove_device(struct device *dev) return r; } +int xen_reset_device(const struct pci_dev *dev) +{ + struct pci_device_reset device = { + .dev.seg = pci_domain_nr(dev->bus), + .dev.bus = dev->bus->number, + .dev.devfn = dev->devfn, + .flags = PCI_DEVICE_RESET_FLR, + }; + + if (pci_domain_nr(dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values. + */ + dev_info(&dev->dev, + "unable to notify Xen of device reset: invalid PCI segment\n"); + return 0; + } + + return HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_reset, &device); +} +EXPORT_SYMBOL_GPL(xen_reset_device); + static int xen_pci_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -192,3 +254,120 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. 
*/ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +#endif + +#ifdef CONFIG_XEN_DOM0 +struct xen_device_domain_owner { + domid_t domain; + struct pci_dev *dev; + struct list_head list; +}; + +static DEFINE_SPINLOCK(dev_domain_list_spinlock); +static LIST_HEAD(dev_domain_list); + +static struct xen_device_domain_owner *find_device(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + list_for_each_entry(owner, &dev_domain_list, list) { + if (owner->dev == dev) + return owner; + } + return NULL; +} + +int xen_find_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + int domain = -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (owner) + domain = owner->domain; + spin_unlock(&dev_domain_list_spinlock); + return domain; +} +EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); + +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) +{ + struct xen_device_domain_owner *owner; + + owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); + if (!owner) + return -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + if (find_device(dev)) { + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return -EEXIST; + } + owner->domain = domain; + owner->dev = dev; + list_add_tail(&owner->list, &dev_domain_list); + spin_unlock(&dev_domain_list_spinlock); + return 0; +} +EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); + +int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (!owner) { + spin_unlock(&dev_domain_list_spinlock); + return -ENODEV; + } + list_del(&owner->list); + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 79e1dff7ed4f..093ad4a08672 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -40,12 +40,16 @@ #include <linux/capability.h> #include <xen/xen.h> +#include <xen/acpi.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/interface/platform.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#ifdef CONFIG_ACPI +#include <acpi/processor.h> +#endif /* * @cpu_id: Xen physical cpu logic number @@ -57,10 +61,11 @@ struct pcpu { struct list_head list; struct device dev; uint32_t cpu_id; + uint32_t acpi_id; uint32_t flags; }; -static struct bus_type xen_pcpu_subsys = { +static const struct bus_type xen_pcpu_subsys = { .name = "xen_cpu", .dev_name = "xen_cpu", }; @@ -77,7 +82,7 @@ static int xen_pcpu_down(uint32_t cpu_id) .u.cpu_ol.cpuid = cpu_id, }; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } static int xen_pcpu_up(uint32_t cpu_id) @@ -88,10 +93,10 @@ static int xen_pcpu_up(uint32_t cpu_id) .u.cpu_ol.cpuid = cpu_id, }; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } -static ssize_t show_online(struct device *dev, +static ssize_t online_show(struct device *dev, struct 
device_attribute *attr, char *buf) { @@ -100,7 +105,7 @@ static ssize_t show_online(struct device *dev, return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); } -static ssize_t __ref store_online(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -129,7 +134,34 @@ static ssize_t __ref store_online(struct device *dev, ret = count; return ret; } -static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); +static DEVICE_ATTR_RW(online); + +static struct attribute *pcpu_dev_attrs[] = { + &dev_attr_online.attr, + NULL +}; + +static umode_t pcpu_dev_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + struct device *dev = kobj_to_dev(kobj); + /* + * Xen never offline cpu0 due to several restrictions + * and assumptions. This basically doesn't add a sys control + * to user, one cannot attempt to offline BSP. + */ + return dev->id ? attr->mode : 0; +} + +static const struct attribute_group pcpu_dev_group = { + .attrs = pcpu_dev_attrs, + .is_visible = pcpu_dev_is_visible, +}; + +static const struct attribute_group *pcpu_dev_groups[] = { + &pcpu_dev_group, + NULL +}; static bool xen_pcpu_online(uint32_t flags) { @@ -180,9 +212,6 @@ static void unregister_and_remove_pcpu(struct pcpu *pcpu) return; dev = &pcpu->dev; - if (dev->id) - device_remove_file(dev, &dev_attr_online); - /* pcpu remove would be implicitly done */ device_unregister(dev); } @@ -199,26 +228,14 @@ static int register_pcpu(struct pcpu *pcpu) dev->bus = &xen_pcpu_subsys; dev->id = pcpu->cpu_id; dev->release = pcpu_release; + dev->groups = pcpu_dev_groups; err = device_register(dev); if (err) { - pcpu_release(dev); + put_device(dev); return err; } - /* - * Xen never offline cpu0 due to several restrictions - * and assumptions. This basically doesn't add a sys control - * to user, one cannot attempt to offline BSP. - */ - if (dev->id) { - err = device_create_file(dev, &dev_attr_online); - if (err) { - device_unregister(dev); - return err; - } - } - return 0; } @@ -236,6 +253,7 @@ static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) INIT_LIST_HEAD(&pcpu->list); pcpu->cpu_id = info->xen_cpuid; + pcpu->acpi_id = info->acpi_id; pcpu->flags = info->flags; /* Need hold on xen_pcpu_lock before pcpu list manipulations */ @@ -264,7 +282,7 @@ static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) .u.pcpu_info.xen_cpuid = cpu, }; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return ret; @@ -332,41 +350,6 @@ static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Sync with Xen hypervisor after cpu hotadded */ -void xen_pcpu_hotplug_sync(void) -{ - schedule_work(&xen_pcpu_work); -} -EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); - -/* - * For hypervisor presented cpu, return logic cpu id; - * For hypervisor non-presented cpu, return -ENODEV. 
- */ -int xen_pcpu_id(uint32_t acpi_id) -{ - int cpu_id = 0, max_id = 0; - struct xen_platform_op op; - - op.cmd = XENPF_get_cpuinfo; - while (cpu_id <= max_id) { - op.u.pcpu_info.xen_cpuid = cpu_id; - if (HYPERVISOR_dom0_op(&op)) { - cpu_id++; - continue; - } - - if (acpi_id == op.u.pcpu_info.acpi_id) - return cpu_id; - if (op.u.pcpu_info.max_present > max_id) - max_id = op.u.pcpu_info.max_present; - cpu_id++; - } - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(xen_pcpu_id); - static int __init xen_pcpu_init(void) { int irq, ret; @@ -403,3 +386,40 @@ err1: return ret; } arch_initcall(xen_pcpu_init); + +#ifdef CONFIG_ACPI +bool __init xen_processor_present(uint32_t acpi_id) +{ + const struct pcpu *pcpu; + bool online = false; + + mutex_lock(&xen_pcpu_lock); + list_for_each_entry(pcpu, &xen_pcpus, list) + if (pcpu->acpi_id == acpi_id) { + online = pcpu->flags & XEN_PCPU_FLAGS_ONLINE; + break; + } + mutex_unlock(&xen_pcpu_lock); + + return online; +} + +void xen_sanitize_proc_cap_bits(uint32_t *cap) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap }; + int ret; + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + ret = HYPERVISOR_platform_op(&op); + if (ret) + pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n", + ret); + else + *cap = buf[2]; +} +#endif diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 99db9e1eb8ba..1db82da56db6 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -1,30 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * platform-pci.c * * Xen platform PCI device driver + * + * Authors: ssmith@xensource.com and stefano.stabellini@eu.citrix.com + * * Copyright (c) 2005, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * */ #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/pci.h> #include <xen/platform_pci.h> @@ -36,16 +26,14 @@ #define DRV_NAME "xen-platform-pci" -MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); -MODULE_DESCRIPTION("Xen platform PCI device"); -MODULE_LICENSE("GPL"); +#define PCI_DEVICE_ID_XEN_PLATFORM_XS61 0x0002 static unsigned long platform_mmio; static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -68,7 +56,8 @@ static uint64_t get_callback_via(struct pci_dev *pdev) pin = pdev->pin; /* We don't know the GSI. Specify the PCI INTx line instead. 
*/ - return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + return ((uint64_t)HVM_PARAM_CALLBACK_TYPE_PCI_INTX << + HVM_CALLBACK_VIA_TYPE_SHIFT) | ((uint64_t)pci_domain_nr(pdev->bus) << 32) | ((uint64_t)pdev->bus->number << 16) | ((uint64_t)(pdev->devfn & 0xff) << 8) | @@ -77,37 +66,39 @@ static uint64_t get_callback_via(struct pci_dev *pdev) static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) { - xen_hvm_evtchn_do_upcall(); - return IRQ_HANDLED; + return xen_evtchn_do_upcall(); } static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_SHARED, "xen-platform-pci", pdev); } -static int platform_pci_resume(struct pci_dev *pdev) +static int platform_pci_resume(struct device *dev) { int err; + if (xen_have_vector_callback) return 0; + err = xen_set_callback_via(callback_via); if (err) { - dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + dev_err(dev, "platform_pci_resume failure!\n"); return err; } return 0; } -static int platform_pci_init(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int platform_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) { int i, ret; long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -137,30 +128,42 @@ static int platform_pci_init(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; - if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); goto out; } + /* + * It doesn't strictly *have* to run on CPU0 but it sure + * as hell better process the event channel ports delivered + * to CPU0. 
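The get_callback_via() return value above packs the PCI INTx routing into the 64-bit HVM callback parameter: the callback type in bits 63-56, the PCI segment in bits 47-32, the bus in bits 23-16, the devfn in bits 15-8, and (pin - 1) in bits 1-0. A small standalone sketch recomputing it for a hypothetical platform device at 0000:00:03.0 using INTA (the type value of 1 for PCI INTx is assumed from the hypervisor ABI):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t type = 1;			/* HVM_PARAM_CALLBACK_TYPE_PCI_INTX (assumed) */
	uint64_t seg = 0, bus = 0;		/* hypothetical segment/bus 0000:00 */
	uint64_t devfn = (3 << 3) | 0;		/* slot 3, function 0 */
	uint64_t pin = 1;			/* INTA */

	uint64_t via = (type << 56) | (seg << 32) | (bus << 16) |
		       ((devfn & 0xff) << 8) | ((pin - 1) & 3);

	printf("callback_via = 0x%016llx\n", (unsigned long long)via);
	/* Prints 0x0100000000001800 for this example. */
	return 0;
}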
+ */ + irq_set_affinity(pdev->irq, cpumask_of(0)); + callback_via = get_callback_via(pdev); ret = xen_set_callback_via(callback_via); if (ret) { dev_warn(&pdev->dev, "Unable to set the evtchn callback " "err=%d\n", ret); - goto out; + goto irq_out; } } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); + if (ret) + goto irq_out; ret = gnttab_init(); if (ret) - goto out; - xenbus_probe(NULL); + goto grant_out; return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); +irq_out: + if (!xen_have_vector_callback) + free_irq(pdev->irq, pdev); out: pci_release_region(pdev, 0); mem_out: @@ -170,26 +173,25 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] = { +static const struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM_XS61, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} }; -MODULE_DEVICE_TABLE(pci, platform_pci_tbl); +static const struct dev_pm_ops platform_pm_ops = { + .resume_noirq = platform_pci_resume, +}; static struct pci_driver platform_driver = { .name = DRV_NAME, - .probe = platform_pci_init, + .probe = platform_pci_probe, .id_table = platform_pci_tbl, -#ifdef CONFIG_PM - .resume_early = platform_pci_resume, -#endif + .driver = { + .pm = &platform_pm_ops, + }, }; -static int __init platform_pci_module_init(void) -{ - return pci_register_driver(&platform_driver); -} - -module_init(platform_pci_module_init); +builtin_pci_driver(platform_driver); diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c new file mode 100644 index 000000000000..0f0dad427d7e --- /dev/null +++ b/drivers/xen/privcmd-buf.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT + +/****************************************************************************** + * privcmd-buf.c + * + * Mmap of hypercall buffers. 
+ * + * Copyright (c) 2018 Juergen Gross + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include "privcmd.h" + +MODULE_DESCRIPTION("Xen Mmap of hypercall buffers"); +MODULE_LICENSE("GPL"); + +struct privcmd_buf_private { + struct mutex lock; + struct list_head list; +}; + +struct privcmd_buf_vma_private { + struct privcmd_buf_private *file_priv; + struct list_head list; + unsigned int users; + unsigned int n_pages; + struct page *pages[]; +}; + +static int privcmd_buf_open(struct inode *ino, struct file *file) +{ + struct privcmd_buf_private *file_priv; + + file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL); + if (!file_priv) + return -ENOMEM; + + mutex_init(&file_priv->lock); + INIT_LIST_HEAD(&file_priv->list); + + file->private_data = file_priv; + + return 0; +} + +static void privcmd_buf_vmapriv_free(struct privcmd_buf_vma_private *vma_priv) +{ + unsigned int i; + + list_del(&vma_priv->list); + + for (i = 0; i < vma_priv->n_pages; i++) + __free_page(vma_priv->pages[i]); + + kfree(vma_priv); +} + +static int privcmd_buf_release(struct inode *ino, struct file *file) +{ + struct privcmd_buf_private *file_priv = file->private_data; + struct privcmd_buf_vma_private *vma_priv; + + mutex_lock(&file_priv->lock); + + while (!list_empty(&file_priv->list)) { + vma_priv = list_first_entry(&file_priv->list, + struct privcmd_buf_vma_private, + list); + privcmd_buf_vmapriv_free(vma_priv); + } + + mutex_unlock(&file_priv->lock); + + kfree(file_priv); + + return 0; +} + +static void privcmd_buf_vma_open(struct vm_area_struct *vma) +{ + struct privcmd_buf_vma_private *vma_priv = vma->vm_private_data; + + if (!vma_priv) + return; + + mutex_lock(&vma_priv->file_priv->lock); + vma_priv->users++; + mutex_unlock(&vma_priv->file_priv->lock); +} + +static void privcmd_buf_vma_close(struct vm_area_struct *vma) +{ + struct privcmd_buf_vma_private *vma_priv = vma->vm_private_data; + struct privcmd_buf_private *file_priv; + + if (!vma_priv) + return; + + file_priv = vma_priv->file_priv; + + mutex_lock(&file_priv->lock); + + vma_priv->users--; + if (!vma_priv->users) + privcmd_buf_vmapriv_free(vma_priv); + + mutex_unlock(&file_priv->lock); +} + +static vm_fault_t privcmd_buf_vma_fault(struct vm_fault *vmf) +{ + pr_debug("fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, + vmf->pgoff, (void *)vmf->address); + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct privcmd_buf_vm_ops = { + .open = privcmd_buf_vma_open, + .close = privcmd_buf_vma_close, + .fault = privcmd_buf_vma_fault, +}; + +static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct privcmd_buf_private *file_priv = file->private_data; + struct privcmd_buf_vma_private *vma_priv; + unsigned long count = vma_pages(vma); + unsigned int i; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_priv = kzalloc(struct_size(vma_priv, pages, count), GFP_KERNEL); + if (!vma_priv) + return -ENOMEM; + + for (i = 0; i < count; i++) { + vma_priv->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vma_priv->pages[i]) + break; + vma_priv->n_pages++; + } + + mutex_lock(&file_priv->lock); + + vma_priv->file_priv = file_priv; + vma_priv->users = 1; + + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); + vma->vm_ops = &privcmd_buf_vm_ops; + vma->vm_private_data = vma_priv; + + 
list_add(&vma_priv->list, &file_priv->list); + + if (vma_priv->n_pages != count) + ret = -ENOMEM; + else + ret = vm_map_pages_zero(vma, vma_priv->pages, + vma_priv->n_pages); + + if (ret) + privcmd_buf_vmapriv_free(vma_priv); + + mutex_unlock(&file_priv->lock); + + return ret; +} + +const struct file_operations xen_privcmdbuf_fops = { + .owner = THIS_MODULE, + .open = privcmd_buf_open, + .release = privcmd_buf_release, + .mmap = privcmd_buf_mmap, +}; +EXPORT_SYMBOL_GPL(xen_privcmdbuf_fops); + +struct miscdevice xen_privcmdbuf_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/hypercall", + .fops = &xen_privcmdbuf_fops, +}; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index f8e5dd701ecb..f52a457b302d 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * privcmd.c * @@ -8,11 +9,17 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/eventfd.h> +#include <linux/file.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/mutex.h> +#include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/srcu.h> #include <linux/string.h> +#include <linux/workqueue.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/mman.h> @@ -22,43 +29,73 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/miscdevice.h> +#include <linux/moduleparam.h> +#include <linux/virtio_mmio.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlb.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include <xen/xen.h> +#include <xen/events.h> #include <xen/privcmd.h> #include <xen/interface/xen.h> +#include <xen/interface/memory.h> +#include <xen/interface/hvm/dm_op.h> +#include <xen/interface/hvm/ioreq.h> #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/balloon.h> +#ifdef CONFIG_XEN_ACPI +#include <xen/acpi.h> +#endif #include "privcmd.h" +MODULE_DESCRIPTION("Xen hypercall passthrough driver"); MODULE_LICENSE("GPL"); #define PRIV_VMA_LOCKED ((void *)1) -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif +static unsigned int privcmd_dm_op_max_num = 16; +module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); +MODULE_PARM_DESC(dm_op_max_nr_bufs, + "Maximum number of buffers per dm_op hypercall"); -static long privcmd_ioctl_hypercall(void __user *udata) +static unsigned int privcmd_dm_op_buf_max_size = 4096; +module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, + 0644); +MODULE_PARM_DESC(dm_op_buf_max_size, + "Maximum size of a dm_op hypercall buffer"); + +struct privcmd_data { + domid_t domid; +}; + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); + +static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_hypercall hypercall; long ret; + /* Disallow arbitrary hypercalls if restricted */ + if (data->domid != DOMID_INVALID) + return -EPERM; + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; + xen_preemptible_hcall_begin(); ret = privcmd_call(hypercall.op, hypercall.arg[0], hypercall.arg[1], hypercall.arg[2], hypercall.arg[3], hypercall.arg[4]); + xen_preemptible_hcall_end(); return ret; } @@ -156,16 +193,47 @@ static int 
traverse_pages(unsigned nelem, size_t size, return ret; } -struct mmap_mfn_state { +/* + * Similar to traverse_pages, but use each page as a "block" of + * data to be processed as one unit. + */ +static int traverse_pages_block(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, int nr, void *state), + void *state) +{ + void *pagedata; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + while (nelem) { + int nr = (PAGE_SIZE/size); + struct page *page; + if (nr > nelem) + nr = nelem; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + ret = (*fn)(pagedata, nr, state); + if (ret) + break; + nelem -= nr; + } + + return ret; +} + +struct mmap_gfn_state { unsigned long va; struct vm_area_struct *vma; domid_t domain; }; -static int mmap_mfn_range(void *data, void *state) +static int mmap_gfn_range(void *data, void *state) { struct privcmd_mmap_entry *msg = data; - struct mmap_mfn_state *st = state; + struct mmap_gfn_state *st = state; struct vm_area_struct *vma = st->vma; int rc; @@ -179,7 +247,7 @@ static int mmap_mfn_range(void *data, void *state) ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) return -EINVAL; - rc = xen_remap_domain_mfn_range(vma, + rc = xen_remap_domain_gfn_range(vma, msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, @@ -192,22 +260,27 @@ static int mmap_mfn_range(void *data, void *state) return 0; } -static long privcmd_ioctl_mmap(void __user *udata) +static long privcmd_ioctl_mmap(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_mmap mmapcmd; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; LIST_HEAD(pagelist); - struct mmap_mfn_state state; + struct mmap_gfn_state state; - /* We only support privcmd_ioctl_mmap_batch for auto translated. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ + if (!xen_pv_domain()) return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) + return -EPERM; + rc = gather_array(&pagelist, mmapcmd.num, sizeof(struct privcmd_mmap_entry), mmapcmd.entry); @@ -215,19 +288,19 @@ static long privcmd_ioctl_mmap(void __user *udata) if (rc || list_empty(&pagelist)) goto out; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); { struct page *page = list_first_entry(&pagelist, struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; } state.va = vma->vm_start; @@ -236,11 +309,11 @@ static long privcmd_ioctl_mmap(void __user *udata) rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), &pagelist, - mmap_mfn_range, &state); + mmap_gfn_range, &state); out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); out: free_page_list(&pagelist); @@ -262,48 +335,34 @@ struct mmap_batch_state { int global_error; int version; - /* User-space mfn array to store errors in the second pass for V1. */ - xen_pfn_t __user *user_mfn; + /* User-space gfn array to store errors in the second pass for V1. 
*/ + xen_pfn_t __user *user_gfn; /* User-space int array to store errors in the second pass for V2. */ int __user *user_err; }; -/* auto translated dom0 note: if domU being created is PV, then mfn is - * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). +/* auto translated dom0 note: if domU being created is PV, then gfn is + * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP). */ -static int mmap_batch_fn(void *data, void *state) +static int mmap_batch_fn(void *data, int nr, void *state) { - xen_pfn_t *mfnp = data; + xen_pfn_t *gfnp = data; struct mmap_batch_state *st = state; struct vm_area_struct *vma = st->vma; struct page **pages = vma->vm_private_data; - struct page *cur_page = NULL; + struct page **cur_pages = NULL; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - cur_page = pages[st->index++]; - - ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain, - &cur_page); + if (!xen_pv_domain()) + cur_pages = &pages[st->index]; - /* Store error code for second pass. */ - if (st->version == 1) { - if (ret < 0) { - /* - * V1 encodes the error codes in the 32bit top nibble of the - * mfn (with its known limitations vis-a-vis 64 bit callers). - */ - *mfnp |= (ret == -ENOENT) ? - PRIVCMD_MMAPBATCH_PAGED_ERROR : - PRIVCMD_MMAPBATCH_MFN_ERROR; - } - } else { /* st->version == 2 */ - *((int *) mfnp) = ret; - } + BUG_ON(nr < 0); + ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr, + (int *)gfnp, st->vma->vm_page_prot, + st->domain, cur_pages); - /* And see if it affects the global_error. */ - if (ret < 0) { + /* Adjust the global_error? */ + if (ret != nr) { if (ret == -ENOENT) st->global_error = -ENOENT; else { @@ -312,23 +371,35 @@ static int mmap_batch_fn(void *data, void *state) st->global_error = 1; } } - st->va += PAGE_SIZE; + st->va += XEN_PAGE_SIZE * nr; + st->index += nr / XEN_PFN_PER_PAGE; return 0; } -static int mmap_return_errors(void *data, void *state) +static int mmap_return_error(int err, struct mmap_batch_state *st) { - struct mmap_batch_state *st = state; + int ret; if (st->version == 1) { - xen_pfn_t mfnp = *((xen_pfn_t *) data); - if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) - return __put_user(mfnp, st->user_mfn++); - else - st->user_mfn++; + if (err) { + xen_pfn_t gfn; + + ret = get_user(gfn, st->user_gfn); + if (ret < 0) + return ret; + /* + * V1 encodes the error codes in the 32bit top + * nibble of the gfn (with its known + * limitations vis-a-vis 64 bit callers). + */ + gfn |= (err == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(gfn, st->user_gfn++); + } else + st->user_gfn++; } else { /* st->version == 2 */ - int err = *((int *) data); if (err) return __put_user(err, st->user_err++); else @@ -338,7 +409,22 @@ static int mmap_return_errors(void *data, void *state) return 0; } -/* Allocate pfns that are then mapped with gmfns from foreign domid. Update +static int mmap_return_errors(void *data, int nr, void *state) +{ + struct mmap_batch_state *st = state; + int *errs = data; + int i; + int ret; + + for (i = 0; i < nr; i++) { + ret = mmap_return_error(errs[i], st); + if (ret < 0) + return ret; + } + return 0; +} + +/* Allocate pfns that are then mapped with gfns from foreign domid. Update * the vma with the page info to use later. 
* Returns: 0 if success, otherwise -errno */ @@ -347,27 +433,29 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) int rc; struct page **pages; - pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); if (pages == NULL) return -ENOMEM; - rc = alloc_xenballooned_pages(numpgs, pages, 0); + rc = xen_alloc_unpopulated_pages(numpgs, pages); if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); - kfree(pages); + kvfree(pages); return -ENOMEM; } - BUG_ON(vma->vm_private_data != PRIV_VMA_LOCKED); + BUG_ON(vma->vm_private_data != NULL); vma->vm_private_data = pages; return 0; } -static struct vm_operations_struct privcmd_vm_ops; +static const struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +static long privcmd_ioctl_mmap_batch( + struct file *file, void __user *udata, int version) { + struct privcmd_data *data = file->private_data; int ret; struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; @@ -382,21 +470,25 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) return -EFAULT; /* Returns per-frame error in m.arr. */ m.err = NULL; - if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + if (!access_ok(m.arr, m.num * sizeof(*m.arr))) return -EFAULT; break; case 2: if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) return -EFAULT; /* Returns per-frame error code in m.err. */ - if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + if (!access_ok(m.err, m.num * (sizeof(*m.err)))) return -EFAULT; break; default: return -EINVAL; } - nr_pages = m.num; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != m.dom) + return -EPERM; + + nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; @@ -417,23 +509,47 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) } } - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, m.addr); if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); + vma->vm_ops != &privcmd_vm_ops) { ret = -EINVAL; - goto out; + goto out_unlock; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { - ret = alloc_empty_pages(vma, m.num); - if (ret < 0) { - up_write(&mm->mmap_sem); - goto out; + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). 
+ */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (!xen_pv_domain()) { + ret = alloc_empty_pages(vma, nr_pages); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; } } @@ -444,18 +560,19 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.global_error = 0; state.version = version; + BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); /* mmap_batch_fn guarantees ret == 0 */ - BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state)); + BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (state.global_error) { /* Write back errors in second pass. */ - state.user_mfn = (xen_pfn_t *)m.arr; + state.user_gfn = (xen_pfn_t *)m.arr; state.user_err = m.err; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_return_errors, &state); + ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_return_errors, &state); } else ret = 0; @@ -466,64 +583,1034 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) out: free_page_list(&pagelist); + return ret; + +out_unlock: + mmap_write_unlock(mm); + goto out; +} + +static int lock_pages( + struct privcmd_dm_op_buf kbufs[], unsigned int num, + struct page *pages[], unsigned int nr_pages, unsigned int *pinned) +{ + unsigned int i, off = 0; + + for (i = 0; i < num; ) { + unsigned int requested; + int page_count; + + requested = DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE) - off; + if (requested > nr_pages) + return -ENOSPC; + + page_count = pin_user_pages_fast( + (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, + requested, FOLL_WRITE, pages); + if (page_count <= 0) + return page_count ? : -EFAULT; + + *pinned += page_count; + nr_pages -= page_count; + pages += page_count; + + off = (requested == page_count) ? 
0 : off + page_count; + i += !off; + } + + return 0; +} + +static void unlock_pages(struct page *pages[], unsigned int nr_pages) +{ + unpin_user_pages_dirty_lock(pages, nr_pages, true); +} + +static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_dm_op kdata; + struct privcmd_dm_op_buf *kbufs; + unsigned int nr_pages = 0; + struct page **pages = NULL; + struct xen_dm_op_buf *xbufs = NULL; + unsigned int i; + long rc; + unsigned int pinned = 0; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != kdata.dom) + return -EPERM; + + if (kdata.num == 0) + return 0; + + if (kdata.num > privcmd_dm_op_max_num) + return -E2BIG; + + kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL); + if (!kbufs) + return -ENOMEM; + + if (copy_from_user(kbufs, kdata.ubufs, + sizeof(*kbufs) * kdata.num)) { + rc = -EFAULT; + goto out; + } + + for (i = 0; i < kdata.num; i++) { + if (kbufs[i].size > privcmd_dm_op_buf_max_size) { + rc = -E2BIG; + goto out; + } + + if (!access_ok(kbufs[i].uptr, + kbufs[i].size)) { + rc = -EFAULT; + goto out; + } + + nr_pages += DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE); + } + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + rc = -ENOMEM; + goto out; + } + + xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL); + if (!xbufs) { + rc = -ENOMEM; + goto out; + } + + rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); + if (rc < 0) + goto out; + + for (i = 0; i < kdata.num; i++) { + set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); + xbufs[i].size = kbufs[i].size; + } + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); + xen_preemptible_hcall_end(); + +out: + unlock_pages(pages, pinned); + kfree(xbufs); + kfree(pages); + kfree(kbufs); + + return rc; +} + +static long privcmd_ioctl_restrict(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + domid_t dom; + + if (copy_from_user(&dom, udata, sizeof(dom))) + return -EFAULT; + + /* Set restriction to the specified domain, or check it matches */ + if (data->domid == DOMID_INVALID) + data->domid = dom; + else if (data->domid != dom) + return -EINVAL; + + return 0; +} + +static long privcmd_ioctl_mmap_resource(struct file *file, + struct privcmd_mmap_resource __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct privcmd_mmap_resource kdata; + xen_pfn_t *pfns = NULL; + struct xen_mem_acquire_resource xdata = { }; + int rc; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != kdata.dom) + return -EPERM; + + /* Both fields must be set or unset */ + if (!!kdata.addr != !!kdata.num) + return -EINVAL; + + xdata.domid = kdata.dom; + xdata.type = kdata.type; + xdata.id = kdata.id; + + if (!kdata.addr && !kdata.num) { + /* Query the size of the resource. 
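+		 * With addr == 0 and num == 0 the hypercall is issued with a
+		 * zeroed frame list and only the number of frames backing the
+		 * resource is copied back to udata->num. Illustrative userspace
+		 * sketch (assuming fd is an open privcmd file descriptor):
+		 *
+		 *   struct privcmd_mmap_resource r = { .dom = dom, .type = type, .id = id };
+		 *   ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &r);
+		 *   // on success, r.num now holds the frame count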
*/ + rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); + if (rc) + return rc; + return __put_user(xdata.nr_frames, &udata->num); + } + + mmap_write_lock(mm); + + vma = find_vma(mm, kdata.addr); + if (!vma || vma->vm_ops != &privcmd_vm_ops) { + rc = -EINVAL; + goto out; + } + + pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN); + if (!pfns) { + rc = -ENOMEM; + goto out; + } + + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { + unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); + struct page **pages; + unsigned int i; + + rc = alloc_empty_pages(vma, nr); + if (rc < 0) + goto out; + + pages = vma->vm_private_data; + + for (i = 0; i < kdata.num; i++) { + xen_pfn_t pfn = + page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); + + pfns[i] = pfn + (i % XEN_PFN_PER_PAGE); + } + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + + xdata.frame = kdata.idx; + xdata.nr_frames = kdata.num; + set_xen_guest_handle(xdata.frame_list, pfns); + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); + xen_preemptible_hcall_end(); + + if (rc) + goto out; + + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { + rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); + } else { + unsigned int domid = + (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? + DOMID_SELF : kdata.dom; + int num, *errs = (int *)pfns; + + BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); + num = xen_remap_domain_mfn_array(vma, + kdata.addr & PAGE_MASK, + pfns, kdata.num, errs, + vma->vm_page_prot, + domid); + if (num < 0) + rc = num; + else if (num != kdata.num) { + unsigned int i; + + for (i = 0; i < num; i++) { + rc = errs[i]; + if (rc < 0) + break; + } + } else + rc = 0; + } + +out: + mmap_write_unlock(mm); + kfree(pfns); + + return rc; +} + +static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata) +{ +#if defined(CONFIG_XEN_ACPI) + int rc; + struct privcmd_pcidev_get_gsi kdata; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + rc = xen_acpi_get_gsi_from_sbdf(kdata.sbdf); + if (rc < 0) + return rc; + + kdata.gsi = rc; + if (copy_to_user(udata, &kdata, sizeof(kdata))) + return -EFAULT; + + return 0; +#else + return -EINVAL; +#endif +} + +#ifdef CONFIG_XEN_PRIVCMD_EVENTFD +/* Irqfd support */ +static struct workqueue_struct *irqfd_cleanup_wq; +static DEFINE_SPINLOCK(irqfds_lock); +DEFINE_STATIC_SRCU(irqfds_srcu); +static LIST_HEAD(irqfds_list); + +struct privcmd_kernel_irqfd { + struct xen_dm_op_buf xbufs; + domid_t dom; + bool error; + struct eventfd_ctx *eventfd; + struct work_struct shutdown; + wait_queue_entry_t wait; + struct list_head list; + poll_table pt; +}; + +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) +{ + lockdep_assert_held(&irqfds_lock); + + list_del_init(&kirqfd->list); + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); +} + +static void irqfd_shutdown(struct work_struct *work) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(work, struct privcmd_kernel_irqfd, shutdown); + u64 cnt; + + /* Make sure irqfd has been initialized in assign path */ + synchronize_srcu(&irqfds_srcu); + + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); + eventfd_ctx_put(kirqfd->eventfd); + kfree(kirqfd); +} + +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) +{ + u64 cnt; + long rc; + + eventfd_ctx_do_read(kirqfd->eventfd, &cnt); + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); + 
xen_preemptible_hcall_end(); + + /* Don't repeat the error message for consecutive failures */ + if (rc && !kirqfd->error) { + pr_err("Failed to configure irq for guest domain: %d\n", + kirqfd->dom); + } + + kirqfd->error = rc; +} + +static int +irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(wait, struct privcmd_kernel_irqfd, wait); + __poll_t flags = key_to_poll(key); + + if (flags & EPOLLIN) + irqfd_inject(kirqfd); + + if (flags & EPOLLHUP) { + unsigned long flags; + + spin_lock_irqsave(&irqfds_lock, flags); + irqfd_deactivate(kirqfd); + spin_unlock_irqrestore(&irqfds_lock, flags); + } + + return 0; +} + +static void +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(pt, struct privcmd_kernel_irqfd, pt); + + add_wait_queue_priority(wqh, &kirqfd->wait); +} + +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + unsigned long flags; + __poll_t events; + void *dm_op; + int ret, idx; + + CLASS(fd, f)(irqfd->fd); + + kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL); + if (!kirqfd) + return -ENOMEM; + dm_op = kirqfd + 1; + + if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) { + ret = -EFAULT; + goto error_kfree; + } + + kirqfd->xbufs.size = irqfd->size; + set_xen_guest_handle(kirqfd->xbufs.h, dm_op); + kirqfd->dom = irqfd->dom; + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); + + if (fd_empty(f)) { + ret = -EBADF; + goto error_kfree; + } + + kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(kirqfd->eventfd)) { + ret = PTR_ERR(kirqfd->eventfd); + goto error_kfree; + } + + /* + * Install our own custom wake-up handling so we are notified via a + * callback whenever someone signals the underlying eventfd. + */ + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry(tmp, &irqfds_list, list) { + if (kirqfd->eventfd == tmp->eventfd) { + ret = -EBUSY; + spin_unlock_irqrestore(&irqfds_lock, flags); + goto error_eventfd; + } + } + + idx = srcu_read_lock(&irqfds_srcu); + list_add_tail(&kirqfd->list, &irqfds_list); + spin_unlock_irqrestore(&irqfds_lock, flags); + /* + * Check if there was an event already pending on the eventfd before we + * registered, and trigger it as if we didn't miss it. + */ + events = vfs_poll(fd_file(f), &kirqfd->pt); + if (events & EPOLLIN) + irqfd_inject(kirqfd); + + srcu_read_unlock(&irqfds_srcu, idx); + return 0; + +error_eventfd: + eventfd_ctx_put(kirqfd->eventfd); + +error_kfree: + kfree(kirqfd); return ret; } +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd; + struct eventfd_ctx *eventfd; + unsigned long flags; + + eventfd = eventfd_ctx_fdget(irqfd->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry(kirqfd, &irqfds_list, list) { + if (kirqfd->eventfd == eventfd) { + irqfd_deactivate(kirqfd); + break; + } + } + + spin_unlock_irqrestore(&irqfds_lock, flags); + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed so + * that we guarantee there will not be any more interrupts once this + * deassign function returns. 
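+	 *
+	 * irqfd_shutdown() also removes the wait queue entry from the
+	 * eventfd, so once the flush below returns no further signals on
+	 * that eventfd can reach irqfd_inject().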
+ */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_irqfd irqfd; + + if (copy_from_user(&irqfd, udata, sizeof(irqfd))) + return -EFAULT; + + /* No other flags should be set */ + if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) + return -EINVAL; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) + return -EPERM; + + if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) + return privcmd_irqfd_deassign(&irqfd); + + return privcmd_irqfd_assign(&irqfd); +} + +static int privcmd_irqfd_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void privcmd_irqfd_exit(void) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + unsigned long flags; + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) + irqfd_deactivate(kirqfd); + + spin_unlock_irqrestore(&irqfds_lock, flags); + + destroy_workqueue(irqfd_cleanup_wq); +} + +/* Ioeventfd Support */ +#define QUEUE_NOTIFY_VQ_MASK 0xFFFF + +static DEFINE_MUTEX(ioreq_lock); +static LIST_HEAD(ioreq_list); + +/* per-eventfd structure */ +struct privcmd_kernel_ioeventfd { + struct eventfd_ctx *eventfd; + struct list_head list; + u64 addr; + unsigned int addr_len; + unsigned int vq; +}; + +/* per-guest CPU / port structure */ +struct ioreq_port { + int vcpu; + unsigned int port; + struct privcmd_kernel_ioreq *kioreq; +}; + +/* per-guest structure */ +struct privcmd_kernel_ioreq { + domid_t dom; + unsigned int vcpus; + u64 uioreq; + struct ioreq *ioreq; + spinlock_t lock; /* Protects ioeventfds list */ + struct list_head ioeventfds; + struct list_head list; + struct ioreq_port ports[] __counted_by(vcpus); +}; + +static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) +{ + struct ioreq_port *port = dev_id; + struct privcmd_kernel_ioreq *kioreq = port->kioreq; + struct ioreq *ioreq = &kioreq->ioreq[port->vcpu]; + struct privcmd_kernel_ioeventfd *kioeventfd; + unsigned int state = STATE_IOREQ_READY; + + if (ioreq->state != STATE_IOREQ_READY || + ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE) + return IRQ_NONE; + + /* + * We need a barrier, smp_mb(), here to ensure reads are finished before + * `state` is updated. Since the lock implementation ensures that + * appropriate barrier will be added anyway, we can avoid adding + * explicit barrier here. + * + * Ideally we don't need to update `state` within the locks, but we do + * that here to avoid adding explicit barrier. + */ + + spin_lock(&kioreq->lock); + ioreq->state = STATE_IOREQ_INPROCESS; + + list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { + if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && + ioreq->size == kioeventfd->addr_len && + (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { + eventfd_signal(kioeventfd->eventfd); + state = STATE_IORESP_READY; + break; + } + } + spin_unlock(&kioreq->lock); + + /* + * We need a barrier, smp_mb(), here to ensure writes are finished + * before `state` is updated. Since the lock implementation ensures that + * appropriate barrier will be added anyway, we can avoid adding + * explicit barrier here. 
+ */ + + ioreq->state = state; + + if (state == STATE_IORESP_READY) { + notify_remote_via_evtchn(port->port); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + +static void ioreq_free(struct privcmd_kernel_ioreq *kioreq) +{ + struct ioreq_port *ports = kioreq->ports; + int i; + + lockdep_assert_held(&ioreq_lock); + + list_del(&kioreq->list); + + for (i = kioreq->vcpus - 1; i >= 0; i--) + unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]); + + kfree(kioreq); +} + +static +struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioreq *kioreq; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct page **pages; + unsigned int *ports; + int ret, size, i; + + lockdep_assert_held(&ioreq_lock); + + size = struct_size(kioreq, ports, ioeventfd->vcpus); + kioreq = kzalloc(size, GFP_KERNEL); + if (!kioreq) + return ERR_PTR(-ENOMEM); + + kioreq->dom = ioeventfd->dom; + kioreq->vcpus = ioeventfd->vcpus; + kioreq->uioreq = ioeventfd->ioreq; + spin_lock_init(&kioreq->lock); + INIT_LIST_HEAD(&kioreq->ioeventfds); + + /* The memory for ioreq server must have been mapped earlier */ + mmap_write_lock(mm); + vma = find_vma(mm, (unsigned long)ioeventfd->ioreq); + if (!vma) { + pr_err("Failed to find vma for ioreq page!\n"); + mmap_write_unlock(mm); + ret = -EFAULT; + goto error_kfree; + } + + pages = vma->vm_private_data; + kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); + mmap_write_unlock(mm); + + ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports), + kioreq->vcpus, sizeof(*ports)); + if (IS_ERR(ports)) { + ret = PTR_ERR(ports); + goto error_kfree; + } + + for (i = 0; i < kioreq->vcpus; i++) { + kioreq->ports[i].vcpu = i; + kioreq->ports[i].port = ports[i]; + kioreq->ports[i].kioreq = kioreq; + + ret = bind_evtchn_to_irqhandler_lateeoi(ports[i], + ioeventfd_interrupt, IRQF_SHARED, "ioeventfd", + &kioreq->ports[i]); + if (ret < 0) + goto error_unbind; + } + + kfree(ports); + + list_add_tail(&kioreq->list, &ioreq_list); + + return kioreq; + +error_unbind: + while (--i >= 0) + unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); + + kfree(ports); +error_kfree: + kfree(kioreq); + return ERR_PTR(ret); +} + +static struct privcmd_kernel_ioreq * +get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd) +{ + struct privcmd_kernel_ioreq *kioreq; + unsigned long flags; + + list_for_each_entry(kioreq, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd; + + /* + * kioreq fields can be accessed here without a lock as they are + * never updated after being added to the ioreq_list. 
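+		 * The list walk itself is still protected: the caller of
+		 * get_ioreq() holds ioreq_lock, so entries cannot be added or
+		 * freed concurrently.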
+ */ + if (kioreq->uioreq != ioeventfd->ioreq) { + continue; + } else if (kioreq->dom != ioeventfd->dom || + kioreq->vcpus != ioeventfd->vcpus) { + pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n", + kioreq->dom, ioeventfd->dom, kioreq->vcpus, + ioeventfd->vcpus); + return ERR_PTR(-EINVAL); + } + + /* Look for a duplicate eventfd for the same guest */ + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { + if (eventfd == kioeventfd->eventfd) { + spin_unlock_irqrestore(&kioreq->lock, flags); + return ERR_PTR(-EBUSY); + } + } + spin_unlock_irqrestore(&kioreq->lock, flags); + + return kioreq; + } + + /* Matching kioreq isn't found, allocate a new one */ + return alloc_ioreq(ioeventfd); +} + +static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd) +{ + list_del(&kioeventfd->list); + eventfd_ctx_put(kioeventfd->eventfd); + kfree(kioeventfd); +} + +static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioeventfd *kioeventfd; + struct privcmd_kernel_ioreq *kioreq; + unsigned long flags; + int ret; + + /* Check for range overflow */ + if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr) + return -EINVAL; + + /* Vhost requires us to support length 1, 2, 4, and 8 */ + if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 || + ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8)) + return -EINVAL; + + /* 4096 vcpus limit enough ? */ + if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096) + return -EINVAL; + + kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL); + if (!kioeventfd) + return -ENOMEM; + + kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); + if (IS_ERR(kioeventfd->eventfd)) { + ret = PTR_ERR(kioeventfd->eventfd); + goto error_kfree; + } + + kioeventfd->addr = ioeventfd->addr; + kioeventfd->addr_len = ioeventfd->addr_len; + kioeventfd->vq = ioeventfd->vq; + + mutex_lock(&ioreq_lock); + kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd); + if (IS_ERR(kioreq)) { + mutex_unlock(&ioreq_lock); + ret = PTR_ERR(kioreq); + goto error_eventfd; + } + + spin_lock_irqsave(&kioreq->lock, flags); + list_add_tail(&kioeventfd->list, &kioreq->ioeventfds); + spin_unlock_irqrestore(&kioreq->lock, flags); + + mutex_unlock(&ioreq_lock); + + return 0; + +error_eventfd: + eventfd_ctx_put(kioeventfd->eventfd); + +error_kfree: + kfree(kioeventfd); + return ret; +} + +static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioreq *kioreq, *tkioreq; + struct eventfd_ctx *eventfd; + unsigned long flags; + int ret = 0; + + eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&ioreq_lock); + list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; + /* + * kioreq fields can be accessed here without a lock as they are + * never updated after being added to the ioreq_list. 
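+		 * ioreq_lock is held across this walk as well; removing the
+		 * last ioeventfd of a kioreq also frees the kioreq itself (see
+		 * the list_empty() check below).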
+ */ + if (kioreq->dom != ioeventfd->dom || + kioreq->uioreq != ioeventfd->ioreq || + kioreq->vcpus != ioeventfd->vcpus) + continue; + + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) { + if (eventfd == kioeventfd->eventfd) { + ioeventfd_free(kioeventfd); + spin_unlock_irqrestore(&kioreq->lock, flags); + + if (list_empty(&kioreq->ioeventfds)) + ioreq_free(kioreq); + goto unlock; + } + } + spin_unlock_irqrestore(&kioreq->lock, flags); + break; + } + + pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n", + ioeventfd->dom, ioeventfd->addr); + ret = -ENODEV; + +unlock: + mutex_unlock(&ioreq_lock); + eventfd_ctx_put(eventfd); + + return ret; +} + +static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_ioeventfd ioeventfd; + + if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd))) + return -EFAULT; + + /* No other flags should be set */ + if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) + return -EINVAL; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom) + return -EPERM; + + if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) + return privcmd_ioeventfd_deassign(&ioeventfd); + + return privcmd_ioeventfd_assign(&ioeventfd); +} + +static void privcmd_ioeventfd_exit(void) +{ + struct privcmd_kernel_ioreq *kioreq, *tmp; + unsigned long flags; + + mutex_lock(&ioreq_lock); + list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; + + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) + ioeventfd_free(kioeventfd); + spin_unlock_irqrestore(&kioreq->lock, flags); + + ioreq_free(kioreq); + } + mutex_unlock(&ioreq_lock); +} +#else +static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + return -EOPNOTSUPP; +} + +static inline int privcmd_irqfd_init(void) +{ + return 0; +} + +static inline void privcmd_irqfd_exit(void) +{ +} + +static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) +{ + return -EOPNOTSUPP; +} + +static inline void privcmd_ioeventfd_exit(void) +{ +} +#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */ + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { - int ret = -ENOSYS; + int ret = -ENOTTY; void __user *udata = (void __user *) data; switch (cmd) { case IOCTL_PRIVCMD_HYPERCALL: - ret = privcmd_ioctl_hypercall(udata); + ret = privcmd_ioctl_hypercall(file, udata); break; case IOCTL_PRIVCMD_MMAP: - ret = privcmd_ioctl_mmap(udata); + ret = privcmd_ioctl_mmap(file, udata); break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata, 1); + ret = privcmd_ioctl_mmap_batch(file, udata, 1); break; case IOCTL_PRIVCMD_MMAPBATCH_V2: - ret = privcmd_ioctl_mmap_batch(udata, 2); + ret = privcmd_ioctl_mmap_batch(file, udata, 2); + break; + + case IOCTL_PRIVCMD_DM_OP: + ret = privcmd_ioctl_dm_op(file, udata); + break; + + case IOCTL_PRIVCMD_RESTRICT: + ret = privcmd_ioctl_restrict(file, udata); + break; + + case IOCTL_PRIVCMD_MMAP_RESOURCE: + ret = privcmd_ioctl_mmap_resource(file, udata); + break; + + case IOCTL_PRIVCMD_IRQFD: + ret = privcmd_ioctl_irqfd(file, udata); + break; + + case IOCTL_PRIVCMD_IOEVENTFD: + ret = privcmd_ioctl_ioeventfd(file, udata); + break; + + case IOCTL_PRIVCMD_PCIDEV_GET_GSI: + ret = privcmd_ioctl_pcidev_get_gsi(file, 
udata); break; default: - ret = -EINVAL; break; } return ret; } +static int privcmd_open(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + /* DOMID_INVALID implies no restriction */ + data->domid = DOMID_INVALID; + + file->private_data = data; + return 0; +} + +static int privcmd_release(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = file->private_data; + + kfree(data); + return 0; +} + static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; - int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int numpgs = vma_pages(vma); + int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; + int rc; - if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + if (xen_pv_domain() || !numpgs || !pages) return; - xen_unmap_domain_mfn_range(vma, numpgs, pages); - free_xenballooned_pages(numpgs, pages); - kfree(pages); + rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); + if (rc == 0) + xen_free_unpopulated_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); + kvfree(pages); } -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t privcmd_fault(struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", - vma, vma->vm_start, vma->vm_end, - vmf->pgoff, vmf->virtual_address); + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, + vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; } -static struct vm_operations_struct privcmd_vm_ops = { +static const struct vm_operations_struct privcmd_vm_ops = { .close = privcmd_close, .fault = privcmd_fault }; @@ -532,22 +1619,38 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; return 0; } -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) +{ + return pte_none(ptep_get(pte)) ? 
0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) { - return !cmpxchg(&vma->vm_private_data, NULL, PRIV_VMA_LOCKED); + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; } const struct file_operations xen_privcmd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = privcmd_ioctl, + .open = privcmd_open, + .release = privcmd_release, .mmap = privcmd_mmap, }; EXPORT_SYMBOL_GPL(xen_privcmd_fops); @@ -570,12 +1673,34 @@ static int __init privcmd_init(void) pr_err("Could not register Xen privcmd device\n"); return err; } + + err = misc_register(&xen_privcmdbuf_dev); + if (err != 0) { + pr_err("Could not register Xen hypercall-buf device\n"); + goto err_privcmdbuf; + } + + err = privcmd_irqfd_init(); + if (err != 0) { + pr_err("irqfd init failed\n"); + goto err_irqfd; + } + return 0; + +err_irqfd: + misc_deregister(&xen_privcmdbuf_dev); +err_privcmdbuf: + misc_deregister(&privcmd_dev); + return err; } static void __exit privcmd_exit(void) { + privcmd_ioeventfd_exit(); + privcmd_irqfd_exit(); misc_deregister(&privcmd_dev); + misc_deregister(&xen_privcmdbuf_dev); } module_init(privcmd_init); diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h index 14facaeed36f..0dd9f8f67ee3 100644 --- a/drivers/xen/privcmd.h +++ b/drivers/xen/privcmd.h @@ -1,3 +1,6 @@ #include <linux/fs.h> extern const struct file_operations xen_privcmd_fops; +extern const struct file_operations xen_privcmdbuf_fops; + +extern struct miscdevice xen_privcmdbuf_dev; diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c new file mode 100644 index 000000000000..c5b6f6fa11eb --- /dev/null +++ b/drivers/xen/pvcalls-back.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * (c) 2017 Stefano Stabellini <stefano@aporeto.com> + */ + +#include <linux/inet.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/module.h> +#include <linux/semaphore.h> +#include <linux/wait.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_connection_sock.h> +#include <net/request_sock.h> +#include <trace/events/sock.h> + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/pvcalls.h> + +#define PVCALLS_VERSIONS "1" +#define MAX_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER + +static struct pvcalls_back_global { + struct list_head frontends; + struct semaphore frontends_lock; +} pvcalls_back_global; + +/* + * Per-frontend data structure. It contains pointers to the command + * ring, its event channel, a list of active sockets and a tree of + * passive sockets. 
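+ * Active connections are kept on the socket_mappings list, while
+ * passive (listening) sockets live in the socketpass_mappings radix
+ * tree, keyed by the frontend-provided socket id; both are protected
+ * by socket_lock.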
+ */ +struct pvcalls_fedata { + struct list_head list; + struct xenbus_device *dev; + struct xen_pvcalls_sring *sring; + struct xen_pvcalls_back_ring ring; + int irq; + struct list_head socket_mappings; + struct radix_tree_root socketpass_mappings; + struct semaphore socket_lock; +}; + +struct pvcalls_ioworker { + struct work_struct register_work; + struct workqueue_struct *wq; +}; + +struct sock_mapping { + struct list_head list; + struct pvcalls_fedata *fedata; + struct sockpass_mapping *sockpass; + struct socket *sock; + uint64_t id; + grant_ref_t ref; + struct pvcalls_data_intf *ring; + void *bytes; + struct pvcalls_data data; + uint32_t ring_order; + int irq; + atomic_t read; + atomic_t write; + atomic_t io; + atomic_t release; + atomic_t eoi; + void (*saved_data_ready)(struct sock *sk); + struct pvcalls_ioworker ioworker; +}; + +struct sockpass_mapping { + struct list_head list; + struct pvcalls_fedata *fedata; + struct socket *sock; + uint64_t id; + struct xen_pvcalls_request reqcopy; + spinlock_t copy_lock; + struct workqueue_struct *wq; + struct work_struct register_work; + void (*saved_data_ready)(struct sock *sk); +}; + +static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map); +static int pvcalls_back_release_active(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sock_mapping *map); + +static bool pvcalls_conn_back_read(void *opaque) +{ + struct sock_mapping *map = (struct sock_mapping *)opaque; + struct msghdr msg; + struct kvec vec[2]; + RING_IDX cons, prod, size, wanted, array_size, masked_prod, masked_cons; + int32_t error; + struct pvcalls_data_intf *intf = map->ring; + struct pvcalls_data *data = &map->data; + unsigned long flags; + int ret; + + array_size = XEN_FLEX_RING_SIZE(map->ring_order); + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + /* read the indexes first, then deal with the data */ + virt_mb(); + + if (error) + return false; + + size = pvcalls_queued(prod, cons, array_size); + if (size >= array_size) + return false; + spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); + if (skb_queue_empty(&map->sock->sk->sk_receive_queue)) { + atomic_set(&map->read, 0); + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, + flags); + return true; + } + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); + wanted = array_size - size; + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + memset(&msg, 0, sizeof(msg)); + if (masked_prod < masked_cons) { + vec[0].iov_base = data->in + masked_prod; + vec[0].iov_len = wanted; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, vec, 1, wanted); + } else { + vec[0].iov_base = data->in + masked_prod; + vec[0].iov_len = array_size - masked_prod; + vec[1].iov_base = data->in; + vec[1].iov_len = wanted - vec[0].iov_len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, vec, 2, wanted); + } + + atomic_set(&map->read, 0); + ret = inet_recvmsg(map->sock, &msg, wanted, MSG_DONTWAIT); + WARN_ON(ret > wanted); + if (ret == -EAGAIN) /* shouldn't happen */ + return true; + if (!ret) + ret = -ENOTCONN; + spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); + if (ret > 0 && !skb_queue_empty(&map->sock->sk->sk_receive_queue)) + atomic_inc(&map->read); + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); + + /* write the data, then modify the indexes */ + virt_wmb(); + if (ret < 0) { + atomic_set(&map->read, 0); + intf->in_error = ret; + } else + intf->in_prod = prod + ret; + /* update the 
indexes, then notify the other end */ + virt_wmb(); + notify_remote_via_irq(map->irq); + + return true; +} + +static bool pvcalls_conn_back_write(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->ring; + struct pvcalls_data *data = &map->data; + struct msghdr msg; + struct kvec vec[2]; + RING_IDX cons, prod, size, array_size; + int ret; + + atomic_set(&map->write, 0); + + cons = intf->out_cons; + prod = intf->out_prod; + /* read the indexes before dealing with the data */ + virt_mb(); + + array_size = XEN_FLEX_RING_SIZE(map->ring_order); + size = pvcalls_queued(prod, cons, array_size); + if (size == 0) + return false; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags |= MSG_DONTWAIT; + if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) { + vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); + vec[0].iov_len = size; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, size); + } else { + vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); + vec[0].iov_len = array_size - pvcalls_mask(cons, array_size); + vec[1].iov_base = data->out; + vec[1].iov_len = size - vec[0].iov_len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 2, size); + } + + ret = inet_sendmsg(map->sock, &msg, size); + if (ret == -EAGAIN) { + atomic_inc(&map->write); + atomic_inc(&map->io); + return true; + } + + /* write the data, then update the indexes */ + virt_wmb(); + if (ret < 0) { + intf->out_error = ret; + } else { + intf->out_error = 0; + intf->out_cons = cons + ret; + prod = intf->out_prod; + } + /* update the indexes, then notify the other end */ + virt_wmb(); + if (prod != cons + ret) { + atomic_inc(&map->write); + atomic_inc(&map->io); + } + notify_remote_via_irq(map->irq); + + return true; +} + +static void pvcalls_back_ioworker(struct work_struct *work) +{ + struct pvcalls_ioworker *ioworker = container_of(work, + struct pvcalls_ioworker, register_work); + struct sock_mapping *map = container_of(ioworker, struct sock_mapping, + ioworker); + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; + + while (atomic_read(&map->io) > 0) { + if (atomic_read(&map->release) > 0) { + atomic_set(&map->release, 0); + return; + } + + if (atomic_read(&map->read) > 0 && + pvcalls_conn_back_read(map)) + eoi_flags = 0; + if (atomic_read(&map->write) > 0 && + pvcalls_conn_back_write(map)) + eoi_flags = 0; + + if (atomic_read(&map->eoi) > 0 && !atomic_read(&map->write)) { + atomic_set(&map->eoi, 0); + xen_irq_lateeoi(map->irq, eoi_flags); + eoi_flags = XEN_EOI_FLAG_SPURIOUS; + } + + atomic_dec(&map->io); + } +} + +static int pvcalls_back_socket(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + if (req->u.socket.domain != AF_INET || + req->u.socket.type != SOCK_STREAM || + (req->u.socket.protocol != IPPROTO_IP && + req->u.socket.protocol != AF_INET)) + ret = -EAFNOSUPPORT; + else + ret = 0; + + /* leave the actual socket allocation for later */ + + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.socket.id = req->u.socket.id; + rsp->ret = ret; + + return 0; +} + +static void pvcalls_sk_state_change(struct sock *sock) +{ + struct sock_mapping *map = sock->sk_user_data; + + if (map == NULL) + return; + + atomic_inc(&map->read); + notify_remote_via_irq(map->irq); +} + +static void pvcalls_sk_data_ready(struct sock *sock) +{ + struct sock_mapping *map = 
sock->sk_user_data; + struct pvcalls_ioworker *iow; + + trace_sk_data_ready(sock); + + if (map == NULL) + return; + + iow = &map->ioworker; + atomic_inc(&map->read); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); +} + +static struct sock_mapping *pvcalls_new_active_socket( + struct pvcalls_fedata *fedata, + uint64_t id, + grant_ref_t ref, + evtchn_port_t evtchn, + struct socket *sock) +{ + int ret; + struct sock_mapping *map; + void *page; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + sock_release(sock); + return NULL; + } + + map->fedata = fedata; + map->sock = sock; + map->id = id; + map->ref = ref; + + ret = xenbus_map_ring_valloc(fedata->dev, &ref, 1, &page); + if (ret < 0) + goto out; + map->ring = page; + map->ring_order = map->ring->ring_order; + /* first read the order, then map the data ring */ + virt_rmb(); + if (map->ring_order > MAX_RING_ORDER) { + pr_warn("%s frontend requested ring_order %u, which is > MAX (%u)\n", + __func__, map->ring_order, MAX_RING_ORDER); + goto out; + } + ret = xenbus_map_ring_valloc(fedata->dev, map->ring->ref, + (1 << map->ring_order), &page); + if (ret < 0) + goto out; + map->bytes = page; + + ret = bind_interdomain_evtchn_to_irqhandler_lateeoi( + fedata->dev, evtchn, + pvcalls_back_conn_event, 0, "pvcalls-backend", map); + if (ret < 0) + goto out; + map->irq = ret; + + map->data.in = map->bytes; + map->data.out = map->bytes + XEN_FLEX_RING_SIZE(map->ring_order); + + map->ioworker.wq = alloc_ordered_workqueue("pvcalls_io", 0); + if (!map->ioworker.wq) + goto out; + atomic_set(&map->io, 1); + INIT_WORK(&map->ioworker.register_work, pvcalls_back_ioworker); + + down(&fedata->socket_lock); + list_add_tail(&map->list, &fedata->socket_mappings); + up(&fedata->socket_lock); + + write_lock_bh(&map->sock->sk->sk_callback_lock); + map->saved_data_ready = map->sock->sk->sk_data_ready; + map->sock->sk->sk_user_data = map; + map->sock->sk->sk_data_ready = pvcalls_sk_data_ready; + map->sock->sk->sk_state_change = pvcalls_sk_state_change; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + + return map; +out: + down(&fedata->socket_lock); + list_del(&map->list); + pvcalls_back_release_active(fedata->dev, fedata, map); + up(&fedata->socket_lock); + return NULL; +} + +static int pvcalls_back_connect(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret = -EINVAL; + struct socket *sock; + struct sock_mapping *map; + struct xen_pvcalls_response *rsp; + struct sockaddr *sa = (struct sockaddr *)&req->u.connect.addr; + + fedata = dev_get_drvdata(&dev->dev); + + if (req->u.connect.len < sizeof(sa->sa_family) || + req->u.connect.len > sizeof(req->u.connect.addr) || + sa->sa_family != AF_INET) + goto out; + + ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock); + if (ret < 0) + goto out; + ret = inet_stream_connect(sock, (struct sockaddr_unsized *)sa, req->u.connect.len, 0); + if (ret < 0) { + sock_release(sock); + goto out; + } + + map = pvcalls_new_active_socket(fedata, + req->u.connect.id, + req->u.connect.ref, + req->u.connect.evtchn, + sock); + if (!map) + ret = -EFAULT; + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.connect.id = req->u.connect.id; + rsp->ret = ret; + + return 0; +} + +static int pvcalls_back_release_active(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sock_mapping *map) +{ + disable_irq(map->irq); + if (map->sock->sk != NULL) { + 
write_lock_bh(&map->sock->sk->sk_callback_lock); + map->sock->sk->sk_user_data = NULL; + map->sock->sk->sk_data_ready = map->saved_data_ready; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + } + + atomic_set(&map->release, 1); + flush_work(&map->ioworker.register_work); + + xenbus_unmap_ring_vfree(dev, map->bytes); + xenbus_unmap_ring_vfree(dev, (void *)map->ring); + unbind_from_irqhandler(map->irq, map); + + sock_release(map->sock); + kfree(map); + + return 0; +} + +static int pvcalls_back_release_passive(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sockpass_mapping *mappass) +{ + if (mappass->sock->sk != NULL) { + write_lock_bh(&mappass->sock->sk->sk_callback_lock); + mappass->sock->sk->sk_user_data = NULL; + mappass->sock->sk->sk_data_ready = mappass->saved_data_ready; + write_unlock_bh(&mappass->sock->sk->sk_callback_lock); + } + sock_release(mappass->sock); + destroy_workqueue(mappass->wq); + kfree(mappass); + + return 0; +} + +static int pvcalls_back_release(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sock_mapping *map, *n; + struct sockpass_mapping *mappass; + int ret = 0; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) { + if (map->id == req->u.release.id) { + list_del(&map->list); + up(&fedata->socket_lock); + ret = pvcalls_back_release_active(dev, fedata, map); + goto out; + } + } + mappass = radix_tree_lookup(&fedata->socketpass_mappings, + req->u.release.id); + if (mappass != NULL) { + radix_tree_delete(&fedata->socketpass_mappings, mappass->id); + up(&fedata->socket_lock); + ret = pvcalls_back_release_passive(dev, fedata, mappass); + } else + up(&fedata->socket_lock); + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->u.release.id = req->u.release.id; + rsp->cmd = req->cmd; + rsp->ret = ret; + return 0; +} + +static void __pvcalls_back_accept(struct work_struct *work) +{ + struct sockpass_mapping *mappass = container_of( + work, struct sockpass_mapping, register_work); + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + .kern = true, + }; + struct sock_mapping *map; + struct pvcalls_ioworker *iow; + struct pvcalls_fedata *fedata; + struct socket *sock; + struct xen_pvcalls_response *rsp; + struct xen_pvcalls_request *req; + int notify; + int ret = -EINVAL; + unsigned long flags; + + fedata = mappass->fedata; + /* + * __pvcalls_back_accept can race against pvcalls_back_accept. + * We only need to check the value of "cmd" on read. It could be + * done atomically, but to simplify the code on the write side, we + * use a spinlock. 
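+	 *
+	 * If "cmd" is not PVCALLS_ACCEPT, there is no accept request
+	 * pending for this socket, so the work item returns without
+	 * touching the ring.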
+ */ + spin_lock_irqsave(&mappass->copy_lock, flags); + req = &mappass->reqcopy; + if (req->cmd != PVCALLS_ACCEPT) { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + return; + } + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + sock = sock_alloc(); + if (sock == NULL) + goto out_error; + sock->type = mappass->sock->type; + sock->ops = mappass->sock->ops; + + ret = inet_accept(mappass->sock, sock, &arg); + if (ret == -EAGAIN) { + sock_release(sock); + return; + } + + map = pvcalls_new_active_socket(fedata, + req->u.accept.id_new, + req->u.accept.ref, + req->u.accept.evtchn, + sock); + if (!map) { + ret = -EFAULT; + goto out_error; + } + + map->sockpass = mappass; + iow = &map->ioworker; + atomic_inc(&map->read); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); + +out_error: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.accept.id = req->u.accept.id; + rsp->ret = ret; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify); + if (notify) + notify_remote_via_irq(fedata->irq); + + mappass->reqcopy.cmd = 0; +} + +static void pvcalls_pass_sk_data_ready(struct sock *sock) +{ + struct sockpass_mapping *mappass = sock->sk_user_data; + struct pvcalls_fedata *fedata; + struct xen_pvcalls_response *rsp; + unsigned long flags; + int notify; + + trace_sk_data_ready(sock); + + if (mappass == NULL) + return; + + fedata = mappass->fedata; + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd == PVCALLS_POLL) { + rsp = RING_GET_RESPONSE(&fedata->ring, + fedata->ring.rsp_prod_pvt++); + rsp->req_id = mappass->reqcopy.req_id; + rsp->u.poll.id = mappass->reqcopy.u.poll.id; + rsp->cmd = mappass->reqcopy.cmd; + rsp->ret = 0; + + mappass->reqcopy.cmd = 0; + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify); + if (notify) + notify_remote_via_irq(mappass->fedata->irq); + } else { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + queue_work(mappass->wq, &mappass->register_work); + } +} + +static int pvcalls_back_bind(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret; + struct sockpass_mapping *map; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&map->register_work, __pvcalls_back_accept); + spin_lock_init(&map->copy_lock); + map->wq = alloc_ordered_workqueue("pvcalls_wq", 0); + if (!map->wq) { + ret = -ENOMEM; + goto out; + } + + ret = sock_create(AF_INET, SOCK_STREAM, 0, &map->sock); + if (ret < 0) + goto out; + + ret = inet_bind(map->sock, (struct sockaddr_unsized *)&req->u.bind.addr, + req->u.bind.len); + if (ret < 0) + goto out; + + map->fedata = fedata; + map->id = req->u.bind.id; + + down(&fedata->socket_lock); + ret = radix_tree_insert(&fedata->socketpass_mappings, map->id, + map); + up(&fedata->socket_lock); + if (ret) + goto out; + + write_lock_bh(&map->sock->sk->sk_callback_lock); + map->saved_data_ready = map->sock->sk->sk_data_ready; + map->sock->sk->sk_user_data = map; + map->sock->sk->sk_data_ready = pvcalls_pass_sk_data_ready; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + +out: + if (ret) { + if (map && map->sock) + sock_release(map->sock); + if (map && map->wq) + destroy_workqueue(map->wq); + kfree(map); + } + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + 
rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.bind.id = req->u.bind.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_listen(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret = -EINVAL; + struct sockpass_mapping *map; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + map = radix_tree_lookup(&fedata->socketpass_mappings, req->u.listen.id); + up(&fedata->socket_lock); + if (map == NULL) + goto out; + + ret = inet_listen(map->sock, req->u.listen.backlog); + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.listen.id = req->u.listen.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_accept(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sockpass_mapping *mappass; + int ret = -EINVAL; + struct xen_pvcalls_response *rsp; + unsigned long flags; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + mappass = radix_tree_lookup(&fedata->socketpass_mappings, + req->u.accept.id); + up(&fedata->socket_lock); + if (mappass == NULL) + goto out_error; + + /* + * Limitation of the current implementation: only support one + * concurrent accept or poll call on one socket. + */ + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd != 0) { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + ret = -EINTR; + goto out_error; + } + + mappass->reqcopy = *req; + spin_unlock_irqrestore(&mappass->copy_lock, flags); + queue_work(mappass->wq, &mappass->register_work); + + /* Tell the caller we don't need to send back a notification yet */ + return -1; + +out_error: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.accept.id = req->u.accept.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_poll(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sockpass_mapping *mappass; + struct xen_pvcalls_response *rsp; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + unsigned long flags; + int ret; + bool data; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + mappass = radix_tree_lookup(&fedata->socketpass_mappings, + req->u.poll.id); + up(&fedata->socket_lock); + if (mappass == NULL) + return -EINVAL; + + /* + * Limitation of the current implementation: only support one + * concurrent accept or poll call on one socket. 
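+	 *
+	 * A non-zero reqcopy.cmd below means an accept or poll is already
+	 * pending on this listening socket, so the new request is rejected
+	 * with -EINTR.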
+ */ + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd != 0) { + ret = -EINTR; + goto out; + } + + mappass->reqcopy = *req; + icsk = inet_csk(mappass->sock->sk); + queue = &icsk->icsk_accept_queue; + data = READ_ONCE(queue->rskq_accept_head) != NULL; + if (data) { + mappass->reqcopy.cmd = 0; + ret = 0; + goto out; + } + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + /* Tell the caller we don't need to send back a notification yet */ + return -1; + +out: + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.poll.id = req->u.poll.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_handle_cmd(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + int ret = 0; + + switch (req->cmd) { + case PVCALLS_SOCKET: + ret = pvcalls_back_socket(dev, req); + break; + case PVCALLS_CONNECT: + ret = pvcalls_back_connect(dev, req); + break; + case PVCALLS_RELEASE: + ret = pvcalls_back_release(dev, req); + break; + case PVCALLS_BIND: + ret = pvcalls_back_bind(dev, req); + break; + case PVCALLS_LISTEN: + ret = pvcalls_back_listen(dev, req); + break; + case PVCALLS_ACCEPT: + ret = pvcalls_back_accept(dev, req); + break; + case PVCALLS_POLL: + ret = pvcalls_back_poll(dev, req); + break; + default: + { + struct pvcalls_fedata *fedata; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + rsp = RING_GET_RESPONSE( + &fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->ret = -ENOTSUPP; + break; + } + } + return ret; +} + +static void pvcalls_back_work(struct pvcalls_fedata *fedata) +{ + int notify, notify_all = 0, more = 1; + struct xen_pvcalls_request req; + struct xenbus_device *dev = fedata->dev; + + while (more) { + while (RING_HAS_UNCONSUMED_REQUESTS(&fedata->ring)) { + RING_COPY_REQUEST(&fedata->ring, + fedata->ring.req_cons++, + &req); + + if (!pvcalls_back_handle_cmd(dev, &req)) { + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY( + &fedata->ring, notify); + notify_all += notify; + } + } + + if (notify_all) { + notify_remote_via_irq(fedata->irq); + notify_all = 0; + } + + RING_FINAL_CHECK_FOR_REQUESTS(&fedata->ring, more); + } +} + +static irqreturn_t pvcalls_back_event(int irq, void *dev_id) +{ + struct xenbus_device *dev = dev_id; + struct pvcalls_fedata *fedata = NULL; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; + + if (dev) { + fedata = dev_get_drvdata(&dev->dev); + if (fedata) { + pvcalls_back_work(fedata); + eoi_flags = 0; + } + } + + xen_irq_lateeoi(irq, eoi_flags); + + return IRQ_HANDLED; +} + +static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map) +{ + struct sock_mapping *map = sock_map; + struct pvcalls_ioworker *iow; + + if (map == NULL || map->sock == NULL || map->sock->sk == NULL || + map->sock->sk->sk_user_data != map) { + xen_irq_lateeoi(irq, 0); + return IRQ_HANDLED; + } + + iow = &map->ioworker; + + atomic_inc(&map->write); + atomic_inc(&map->eoi); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); + + return IRQ_HANDLED; +} + +static int backend_connect(struct xenbus_device *dev) +{ + int err; + evtchn_port_t evtchn; + grant_ref_t ring_ref; + struct pvcalls_fedata *fedata = NULL; + + fedata = kzalloc(sizeof(struct pvcalls_fedata), GFP_KERNEL); + if (!fedata) + return -ENOMEM; + + fedata->irq = -1; + err = xenbus_scanf(XBT_NIL, dev->otherend, "port", "%u", + &evtchn); + if (err != 1) { + err = -EINVAL; 
+ xenbus_dev_fatal(dev, err, "reading %s/event-channel", + dev->otherend); + goto error; + } + + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", "%u", &ring_ref); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", + dev->otherend); + goto error; + } + + err = bind_interdomain_evtchn_to_irq_lateeoi(dev, evtchn); + if (err < 0) + goto error; + fedata->irq = err; + + err = request_threaded_irq(fedata->irq, NULL, pvcalls_back_event, + IRQF_ONESHOT, "pvcalls-back", dev); + if (err < 0) + goto error; + + err = xenbus_map_ring_valloc(dev, &ring_ref, 1, + (void **)&fedata->sring); + if (err < 0) + goto error; + + BACK_RING_INIT(&fedata->ring, fedata->sring, XEN_PAGE_SIZE * 1); + fedata->dev = dev; + + INIT_LIST_HEAD(&fedata->socket_mappings); + INIT_RADIX_TREE(&fedata->socketpass_mappings, GFP_KERNEL); + sema_init(&fedata->socket_lock, 1); + dev_set_drvdata(&dev->dev, fedata); + + down(&pvcalls_back_global.frontends_lock); + list_add_tail(&fedata->list, &pvcalls_back_global.frontends); + up(&pvcalls_back_global.frontends_lock); + + return 0; + + error: + if (fedata->irq >= 0) + unbind_from_irqhandler(fedata->irq, dev); + if (fedata->sring != NULL) + xenbus_unmap_ring_vfree(dev, fedata->sring); + kfree(fedata); + return err; +} + +static int backend_disconnect(struct xenbus_device *dev) +{ + struct pvcalls_fedata *fedata; + struct sock_mapping *map, *n; + struct sockpass_mapping *mappass; + struct radix_tree_iter iter; + void **slot; + + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) { + list_del(&map->list); + pvcalls_back_release_active(dev, fedata, map); + } + + radix_tree_for_each_slot(slot, &fedata->socketpass_mappings, &iter, 0) { + mappass = radix_tree_deref_slot(slot); + if (!mappass) + continue; + if (radix_tree_exception(mappass)) { + if (radix_tree_deref_retry(mappass)) + slot = radix_tree_iter_retry(&iter); + } else { + radix_tree_delete(&fedata->socketpass_mappings, + mappass->id); + pvcalls_back_release_passive(dev, fedata, mappass); + } + } + up(&fedata->socket_lock); + + unbind_from_irqhandler(fedata->irq, dev); + xenbus_unmap_ring_vfree(dev, fedata->sring); + + list_del(&fedata->list); + kfree(fedata); + dev_set_drvdata(&dev->dev, NULL); + + return 0; +} + +static int pvcalls_back_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, abort; + struct xenbus_transaction xbt; + +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + pr_warn("%s cannot create xenstore transaction\n", __func__); + return err; + } + + err = xenbus_printf(xbt, dev->nodename, "versions", "%s", + PVCALLS_VERSIONS); + if (err) { + pr_warn("%s write out 'versions' failed\n", __func__); + goto abort; + } + + err = xenbus_printf(xbt, dev->nodename, "max-page-order", "%u", + MAX_RING_ORDER); + if (err) { + pr_warn("%s write out 'max-page-order' failed\n", __func__); + goto abort; + } + + err = xenbus_printf(xbt, dev->nodename, "function-calls", + XENBUS_FUNCTIONS_CALLS); + if (err) { + pr_warn("%s write out 'function-calls' failed\n", __func__); + goto abort; + } + + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + pr_warn("%s cannot complete xenstore transaction\n", __func__); + return err; + } + + if (abort) + return -EFAULT; + + xenbus_switch_state(dev, XenbusStateInitWait); + + return 0; +} + +static void set_backend_state(struct xenbus_device *dev, + 
enum xenbus_state state) +{ + while (dev->state != state) { + switch (dev->state) { + case XenbusStateClosed: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateInitWait); + break; + case XenbusStateClosing: + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case XenbusStateInitWait: + case XenbusStateInitialised: + switch (state) { + case XenbusStateConnected: + if (backend_connect(dev)) + return; + xenbus_switch_state(dev, XenbusStateConnected); + break; + case XenbusStateClosing: + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case XenbusStateConnected: + switch (state) { + case XenbusStateInitWait: + case XenbusStateClosing: + case XenbusStateClosed: + down(&pvcalls_back_global.frontends_lock); + backend_disconnect(dev); + up(&pvcalls_back_global.frontends_lock); + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case XenbusStateClosing: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + break; + default: + WARN_ON(1); + } + break; + default: + WARN_ON(1); + } + } +} + +static void pvcalls_back_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + switch (frontend_state) { + case XenbusStateInitialising: + set_backend_state(dev, XenbusStateInitWait); + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + set_backend_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + set_backend_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + set_backend_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + device_unregister(&dev->dev); + break; + case XenbusStateUnknown: + set_backend_state(dev, XenbusStateClosed); + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +static void pvcalls_back_remove(struct xenbus_device *dev) +{ +} + +static int pvcalls_back_uevent(const struct xenbus_device *xdev, + struct kobj_uevent_env *env) +{ + return 0; +} + +static const struct xenbus_device_id pvcalls_back_ids[] = { + { "pvcalls" }, + { "" } +}; + +static struct xenbus_driver pvcalls_back_driver = { + .ids = pvcalls_back_ids, + .probe = pvcalls_back_probe, + .remove = pvcalls_back_remove, + .uevent = pvcalls_back_uevent, + .otherend_changed = pvcalls_back_changed, +}; + +static int __init pvcalls_back_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = xenbus_register_backend(&pvcalls_back_driver); + if (ret < 0) + return ret; + + sema_init(&pvcalls_back_global.frontends_lock, 1); + INIT_LIST_HEAD(&pvcalls_back_global.frontends); + return 0; +} +module_init(pvcalls_back_init); + +static void __exit pvcalls_back_fin(void) +{ + struct pvcalls_fedata *fedata, *nfedata; + + down(&pvcalls_back_global.frontends_lock); + list_for_each_entry_safe(fedata, nfedata, + &pvcalls_back_global.frontends, list) { + backend_disconnect(fedata->dev); + } + up(&pvcalls_back_global.frontends_lock); + + xenbus_unregister_driver(&pvcalls_back_driver); +} + +module_exit(pvcalls_back_fin); + +MODULE_DESCRIPTION("Xen PV Calls backend driver"); +MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/pvcalls-front.c 
b/drivers/xen/pvcalls-front.c new file mode 100644 index 000000000000..4926d4badc57 --- /dev/null +++ b/drivers/xen/pvcalls-front.c @@ -0,0 +1,1312 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * (c) 2017 Stefano Stabellini <stefano@aporeto.com> + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/socket.h> + +#include <net/sock.h> + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/pvcalls.h> + +#include "pvcalls-front.h" + +#define PVCALLS_INVALID_ID UINT_MAX +#define PVCALLS_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER +#define PVCALLS_NR_RSP_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE) +#define PVCALLS_FRONT_MAX_SPIN 5000 + +static struct proto pvcalls_proto = { + .name = "PVCalls", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + +struct pvcalls_bedata { + struct xen_pvcalls_front_ring ring; + grant_ref_t ref; + int irq; + + struct list_head socket_mappings; + spinlock_t socket_lock; + + wait_queue_head_t inflight_req; + struct xen_pvcalls_response rsp[PVCALLS_NR_RSP_PER_RING]; +}; +/* Only one front/back connection supported. */ +static struct xenbus_device *pvcalls_front_dev; +static atomic_t pvcalls_refcount; + +/* first increment refcount, then proceed */ +#define pvcalls_enter() { \ + atomic_inc(&pvcalls_refcount); \ +} + +/* first complete other operations, then decrement refcount */ +#define pvcalls_exit() { \ + atomic_dec(&pvcalls_refcount); \ +} + +struct sock_mapping { + bool active_socket; + struct list_head list; + struct socket *sock; + atomic_t refcount; + union { + struct { + int irq; + grant_ref_t ref; + struct pvcalls_data_intf *ring; + struct pvcalls_data data; + struct mutex in_mutex; + struct mutex out_mutex; + + wait_queue_head_t inflight_conn_req; + } active; + struct { + /* + * Socket status, needs to be 64-bit aligned due to the + * test_and_* functions which have this requirement on arm64. + */ +#define PVCALLS_STATUS_UNINITALIZED 0 +#define PVCALLS_STATUS_BIND 1 +#define PVCALLS_STATUS_LISTEN 2 + uint8_t status __attribute__((aligned(8))); + /* + * Internal state-machine flags. + * Only one accept operation can be inflight for a socket. + * Only one poll operation can be inflight for a given socket. + * flags needs to be 64-bit aligned due to the test_and_* + * functions which have this requirement on arm64. 
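+	 * The PVCALLS_FLAG_* values below are bit numbers, used with
+	 * set_bit()/test_and_set_bit() on this flags word.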
+ */ +#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0 +#define PVCALLS_FLAG_POLL_INFLIGHT 1 +#define PVCALLS_FLAG_POLL_RET 2 + uint8_t flags __attribute__((aligned(8))); + uint32_t inflight_req_id; + struct sock_mapping *accept_map; + wait_queue_head_t inflight_accept_req; + } passive; + }; +}; + +static inline struct sock_mapping *pvcalls_enter_sock(struct socket *sock) +{ + struct sock_mapping *map; + + if (!pvcalls_front_dev || + dev_get_drvdata(&pvcalls_front_dev->dev) == NULL) + return ERR_PTR(-ENOTCONN); + + map = (struct sock_mapping *)sock->sk->sk_send_head; + if (map == NULL) + return ERR_PTR(-ENOTSOCK); + + pvcalls_enter(); + atomic_inc(&map->refcount); + return map; +} + +static inline void pvcalls_exit_sock(struct socket *sock) +{ + struct sock_mapping *map; + + map = (struct sock_mapping *)sock->sk->sk_send_head; + atomic_dec(&map->refcount); + pvcalls_exit(); +} + +static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) +{ + *req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); + if (RING_FULL(&bedata->ring) || + bedata->rsp[*req_id].req_id != PVCALLS_INVALID_ID) + return -EAGAIN; + return 0; +} + +static bool pvcalls_front_write_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod, size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error == -ENOTCONN) + return false; + if (error != 0) + return true; + + cons = intf->out_cons; + prod = intf->out_prod; + return !!(size - pvcalls_queued(prod, cons, size)); +} + +static bool pvcalls_front_read_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod; + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + return (error != 0 || + pvcalls_queued(prod, cons, + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) != 0); +} + +static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) +{ + struct xenbus_device *dev = dev_id; + struct pvcalls_bedata *bedata; + struct xen_pvcalls_response *rsp; + uint8_t *src, *dst; + int req_id = 0, more = 0, done = 0; + + if (dev == NULL) + return IRQ_HANDLED; + + pvcalls_enter(); + bedata = dev_get_drvdata(&dev->dev); + if (bedata == NULL) { + pvcalls_exit(); + return IRQ_HANDLED; + } + +again: + while (RING_HAS_UNCONSUMED_RESPONSES(&bedata->ring)) { + rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons); + + req_id = rsp->req_id; + if (rsp->cmd == PVCALLS_POLL) { + struct sock_mapping *map = (struct sock_mapping *)(uintptr_t) + rsp->u.poll.id; + + clear_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags); + /* + * clear INFLIGHT, then set RET. It pairs with + * the checks at the beginning of + * pvcalls_front_poll_passive. + */ + smp_wmb(); + set_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags); + } else { + dst = (uint8_t *)&bedata->rsp[req_id] + + sizeof(rsp->req_id); + src = (uint8_t *)rsp + sizeof(rsp->req_id); + memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id)); + /* + * First copy the rest of the data, then req_id. It is + * paired with the barrier when accessing bedata->rsp. 
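+			 * Waiters check for rsp[req_id].req_id == req_id and
+			 * then issue smp_rmb() before reading the rest of the
+			 * response.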
+ */ + smp_wmb(); + bedata->rsp[req_id].req_id = req_id; + } + + done = 1; + bedata->ring.rsp_cons++; + } + + RING_FINAL_CHECK_FOR_RESPONSES(&bedata->ring, more); + if (more) + goto again; + if (done) + wake_up(&bedata->inflight_req); + pvcalls_exit(); + return IRQ_HANDLED; +} + +static void free_active_ring(struct sock_mapping *map); + +static void pvcalls_front_destroy_active(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ + int i; + + unbind_from_irqhandler(map->active.irq, map); + + if (bedata) { + spin_lock(&bedata->socket_lock); + if (!list_empty(&map->list)) + list_del_init(&map->list); + spin_unlock(&bedata->socket_lock); + } + + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + gnttab_end_foreign_access(map->active.ring->ref[i], NULL); + gnttab_end_foreign_access(map->active.ref, NULL); + free_active_ring(map); +} + +static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ + pvcalls_front_destroy_active(bedata, map); + + kfree(map); +} + +static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map) +{ + struct sock_mapping *map = sock_map; + + if (map == NULL) + return IRQ_HANDLED; + + wake_up_interruptible(&map->active.inflight_conn_req); + + return IRQ_HANDLED; +} + +int pvcalls_front_socket(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + /* + * PVCalls only supports domain AF_INET, + * type SOCK_STREAM and protocol 0 sockets for now. + * + * Check socket type here, AF_INET and protocol checks are done + * by the caller. + */ + if (sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -EACCES; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + pvcalls_exit(); + return -ENOMEM; + } + + spin_lock(&bedata->socket_lock); + + ret = get_request(bedata, &req_id); + if (ret < 0) { + kfree(map); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + + /* + * sock->sk->sk_send_head is not used for ip sockets: reuse the + * field to store a pointer to the struct sock_mapping + * corresponding to the socket. This way, we can easily get the + * struct sock_mapping from the struct socket. 
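+	 * pvcalls_enter_sock() later reads the pointer back from the same
+	 * field.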
+ */ + sock->sk->sk_send_head = (void *)map; + list_add_tail(&map->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_SOCKET; + req->u.socket.id = (uintptr_t) map; + req->u.socket.domain = AF_INET; + req->u.socket.type = SOCK_STREAM; + req->u.socket.protocol = IPPROTO_IP; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + pvcalls_exit(); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_socket); + +static void free_active_ring(struct sock_mapping *map) +{ + if (!map->active.ring) + return; + + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); + free_page((unsigned long)map->active.ring); +} + +static int alloc_active_ring(struct sock_mapping *map) +{ + void *bytes; + + map->active.ring = (struct pvcalls_data_intf *) + get_zeroed_page(GFP_KERNEL); + if (!map->active.ring) + goto out; + + map->active.ring->ring_order = PVCALLS_RING_ORDER; + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); + if (!bytes) + goto out; + + map->active.data.in = bytes; + map->active.data.out = bytes + + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + return 0; + +out: + free_active_ring(map); + return -ENOMEM; +} + +static int create_active(struct sock_mapping *map, evtchn_port_t *evtchn) +{ + void *bytes; + int ret, irq = -1, i; + + *evtchn = 0; + init_waitqueue_head(&map->active.inflight_conn_req); + + bytes = map->active.data.in; + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + map->active.ring->ref[i] = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn(bytes) + i), 0); + + map->active.ref = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn((void *)map->active.ring)), 0); + + ret = xenbus_alloc_evtchn(pvcalls_front_dev, evtchn); + if (ret) + goto out_error; + irq = bind_evtchn_to_irqhandler(*evtchn, pvcalls_front_conn_handler, + 0, "pvcalls-frontend", map); + if (irq < 0) { + ret = irq; + goto out_error; + } + + map->active.irq = irq; + map->active_socket = true; + mutex_init(&map->active.in_mutex); + mutex_init(&map->active.out_mutex); + + return 0; + +out_error: + if (*evtchn > 0) + xenbus_free_evtchn(pvcalls_front_dev, *evtchn); + return ret; +} + +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + evtchn_port_t evtchn; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + ret = alloc_active_ring(map); + if (ret < 0) { + pvcalls_exit_sock(sock); + return ret; + } + ret = create_active(map, &evtchn); + if (ret < 0) { + free_active_ring(map); + pvcalls_exit_sock(sock); + return ret; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_front_destroy_active(NULL, map); + pvcalls_exit_sock(sock); + return 
ret; + } + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_CONNECT; + req->u.connect.id = (uintptr_t)map; + req->u.connect.len = addr_len; + req->u.connect.flags = flags; + req->u.connect.ref = map->active.ref; + req->u.connect.evtchn = evtchn; + memcpy(req->u.connect.addr, addr, sizeof(*addr)); + + map->sock = sock; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + pvcalls_exit_sock(sock); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_connect); + +static int __write_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + int len) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error < 0) + return error; + cons = intf->out_cons; + prod = intf->out_prod; + /* read indexes before continuing */ + virt_mb(); + + size = pvcalls_queued(prod, cons, array_size); + if (size > array_size) + return -EINVAL; + if (size == array_size) + return 0; + if (len > array_size - size) + len = array_size - size; + + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (masked_prod < masked_cons) { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } else { + if (len > array_size - masked_prod) { + int ret = copy_from_iter(data->out + masked_prod, + array_size - masked_prod, msg_iter); + if (ret != array_size - masked_prod) { + len = ret; + goto out; + } + len = ret + copy_from_iter(data->out, len - ret, msg_iter); + } else { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } + } +out: + /* write to ring before updating pointer */ + virt_wmb(); + intf->out_prod += len; + + return len; +} + +int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) +{ + struct sock_mapping *map; + int sent, tot_sent = 0; + int count = 0, flags; + + flags = msg->msg_flags; + if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB)) + return -EOPNOTSUPP; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); + + mutex_lock(&map->active.out_mutex); + if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { + mutex_unlock(&map->active.out_mutex); + pvcalls_exit_sock(sock); + return -EAGAIN; + } + if (len > INT_MAX) + len = INT_MAX; + +again: + count++; + sent = __write_ring(map->active.ring, + &map->active.data, &msg->msg_iter, + len); + if (sent > 0) { + len -= sent; + tot_sent += sent; + notify_remote_via_irq(map->active.irq); + } + if (sent >= 0 && len > 0 && count < PVCALLS_FRONT_MAX_SPIN) + goto again; + if (sent < 0) + tot_sent = sent; + + mutex_unlock(&map->active.out_mutex); + pvcalls_exit_sock(sock); + return tot_sent; +} +EXPORT_SYMBOL_GPL(pvcalls_front_sendmsg); + +static int __read_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + size_t len, int flags) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + /* get 
pointers before reading from the ring */ + virt_rmb(); + + size = pvcalls_queued(prod, cons, array_size); + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (size == 0) + return error ?: size; + + if (len > size) + len = size; + + if (masked_prod > masked_cons) { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } else { + if (len > (array_size - masked_cons)) { + int ret = copy_to_iter(data->in + masked_cons, + array_size - masked_cons, msg_iter); + if (ret != array_size - masked_cons) { + len = ret; + goto out; + } + len = ret + copy_to_iter(data->in, len - ret, msg_iter); + } else { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } + } +out: + /* read data from the ring before increasing the index */ + virt_mb(); + if (!(flags & MSG_PEEK)) + intf->in_cons += len; + + return len; +} + +int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + int ret; + struct sock_mapping *map; + + if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC)) + return -EOPNOTSUPP; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); + + mutex_lock(&map->active.in_mutex); + if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) + len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + while (!(flags & MSG_DONTWAIT) && !pvcalls_front_read_todo(map)) { + wait_event_interruptible(map->active.inflight_conn_req, + pvcalls_front_read_todo(map)); + } + ret = __read_ring(map->active.ring, &map->active.data, + &msg->msg_iter, len, flags); + + if (ret > 0) + notify_remote_via_irq(map->active.irq); + if (ret == 0) + ret = (flags & MSG_DONTWAIT) ? -EAGAIN : 0; + if (ret == -ENOTCONN) + ret = 0; + + mutex_unlock(&map->active.in_mutex); + pvcalls_exit_sock(sock); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_recvmsg); + +int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit_sock(sock); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + map->sock = sock; + req->cmd = PVCALLS_BIND; + req->u.bind.id = (uintptr_t)map; + memcpy(req->u.bind.addr, addr, sizeof(*addr)); + req->u.bind.len = addr_len; + + init_waitqueue_head(&map->passive.inflight_accept_req); + + map->active_socket = false; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_BIND; + pvcalls_exit_sock(sock); + return 0; +} +EXPORT_SYMBOL_GPL(pvcalls_front_bind); + +int pvcalls_front_listen(struct socket *sock, int backlog) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + map = pvcalls_enter_sock(sock); 
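+	/*
+	 * A successful pvcalls_enter_sock() holds a reference on both
+	 * pvcalls_refcount and this socket's mapping until the matching
+	 * pvcalls_exit_sock() below.
+	 */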
+ if (IS_ERR(map)) + return PTR_ERR(map); + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + if (map->passive.status != PVCALLS_STATUS_BIND) { + pvcalls_exit_sock(sock); + return -EOPNOTSUPP; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit_sock(sock); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_LISTEN; + req->u.listen.id = (uintptr_t) map; + req->u.listen.backlog = backlog; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_LISTEN; + pvcalls_exit_sock(sock); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_listen); + +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct sock_mapping *map2 = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret, nonblock; + evtchn_port_t evtchn; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + if (map->passive.status != PVCALLS_STATUS_LISTEN) { + pvcalls_exit_sock(sock); + return -EINVAL; + } + + nonblock = arg->flags & SOCK_NONBLOCK; + /* + * Backend only supports 1 inflight accept request, will return + * errors for the others + */ + if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + req_id = READ_ONCE(map->passive.inflight_req_id); + if (req_id != PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) { + map2 = map->passive.accept_map; + goto received; + } + if (nonblock) { + pvcalls_exit_sock(sock); + return -EAGAIN; + } + if (wait_event_interruptible(map->passive.inflight_accept_req, + !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags))) { + pvcalls_exit_sock(sock); + return -EINTR; + } + } + + map2 = kzalloc(sizeof(*map2), GFP_KERNEL); + if (map2 == NULL) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + pvcalls_exit_sock(sock); + return -ENOMEM; + } + ret = alloc_active_ring(map2); + if (ret < 0) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + kfree(map2); + pvcalls_exit_sock(sock); + return ret; + } + ret = create_active(map2, &evtchn); + if (ret < 0) { + free_active_ring(map2); + kfree(map2); + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + pvcalls_exit_sock(sock); + return ret; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_front_free_map(bedata, map2); + pvcalls_exit_sock(sock); + return ret; + } + + list_add_tail(&map2->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_ACCEPT; + req->u.accept.id = (uintptr_t) map; + req->u.accept.ref = map2->active.ref; + req->u.accept.id_new = (uintptr_t) map2; + req->u.accept.evtchn = evtchn; + map->passive.accept_map = 
map2; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + /* We could check if we have received a response before returning. */ + if (nonblock) { + WRITE_ONCE(map->passive.inflight_req_id, req_id); + pvcalls_exit_sock(sock); + return -EAGAIN; + } + + if (wait_event_interruptible(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) { + pvcalls_exit_sock(sock); + return -EINTR; + } + /* read req_id, then the content */ + smp_rmb(); + +received: + map2->sock = newsock; + newsock->sk = sk_alloc(sock_net(sock->sk), PF_INET, GFP_KERNEL, &pvcalls_proto, false); + if (!newsock->sk) { + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + pvcalls_front_free_map(bedata, map2); + pvcalls_exit_sock(sock); + return -ENOMEM; + } + newsock->sk->sk_send_head = (void *)map2; + + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); + wake_up(&map->passive.inflight_accept_req); + + pvcalls_exit_sock(sock); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_accept); + +static __poll_t pvcalls_front_poll_passive(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + int notify, req_id, ret; + struct xen_pvcalls_request *req; + + if (test_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + uint32_t req_id = READ_ONCE(map->passive.inflight_req_id); + + if (req_id != PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) + return EPOLLIN | EPOLLRDNORM; + + poll_wait(file, &map->passive.inflight_accept_req, wait); + return 0; + } + + if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags)) + return EPOLLIN | EPOLLRDNORM; + + /* + * First check RET, then INFLIGHT. No barriers necessary to + * ensure execution ordering because of the conditional + * instructions creating control dependencies. 
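+	 * The other side of this pairing is pvcalls_front_event_handler(),
+	 * which clears INFLIGHT, does smp_wmb(), then sets RET.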
+ */ + + if (test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags)) { + poll_wait(file, &bedata->inflight_req, wait); + return 0; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_POLL; + req->u.poll.id = (uintptr_t) map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + poll_wait(file, &bedata->inflight_req, wait); + return 0; +} + +static __poll_t pvcalls_front_poll_active(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + __poll_t mask = 0; + int32_t in_error, out_error; + struct pvcalls_data_intf *intf = map->active.ring; + + out_error = intf->out_error; + in_error = intf->in_error; + + poll_wait(file, &map->active.inflight_conn_req, wait); + if (pvcalls_front_write_todo(map)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (pvcalls_front_read_todo(map)) + mask |= EPOLLIN | EPOLLRDNORM; + if (in_error != 0 || out_error != 0) + mask |= EPOLLERR; + + return mask; +} + +__poll_t pvcalls_front_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + __poll_t ret; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return EPOLLNVAL; + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + if (map->active_socket) + ret = pvcalls_front_poll_active(file, bedata, map, wait); + else + ret = pvcalls_front_poll_passive(file, bedata, map, wait); + pvcalls_exit_sock(sock); + return ret; +} +EXPORT_SYMBOL_GPL(pvcalls_front_poll); + +int pvcalls_front_release(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int req_id, notify, ret; + struct xen_pvcalls_request *req; + + if (sock->sk == NULL) + return 0; + + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) { + if (PTR_ERR(map) == -ENOTCONN) + return -EIO; + else + return 0; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit_sock(sock); + return ret; + } + sock->sk->sk_send_head = NULL; + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_RELEASE; + req->u.release.id = (uintptr_t)map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + if (map->active_socket) { + /* + * Set in_error and wake up inflight_conn_req to force + * recvmsg waiters to exit. + */ + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + + /* + * We need to make sure that sendmsg/recvmsg on this socket have + * not started before we've cleared sk_send_head here. The + * easiest way to guarantee this is to see that no pvcalls + * (other than us) is in progress on this socket. 
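+		 * i.e. spin below until map->refcount drops to 1, meaning
+		 * only our own pvcalls_enter_sock() reference is left.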
+ */ + while (atomic_read(&map->refcount) > 1) + cpu_relax(); + + pvcalls_front_free_map(bedata, map); + } else { + wake_up(&bedata->inflight_req); + wake_up(&map->passive.inflight_accept_req); + + while (atomic_read(&map->refcount) > 1) + cpu_relax(); + + spin_lock(&bedata->socket_lock); + list_del(&map->list); + spin_unlock(&bedata->socket_lock); + if (READ_ONCE(map->passive.inflight_req_id) != PVCALLS_INVALID_ID && + READ_ONCE(map->passive.inflight_req_id) != 0) { + pvcalls_front_free_map(bedata, + map->passive.accept_map); + } + kfree(map); + } + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); + + pvcalls_exit(); + return 0; +} +EXPORT_SYMBOL_GPL(pvcalls_front_release); + +static const struct xenbus_device_id pvcalls_front_ids[] = { + { "pvcalls" }, + { "" } +}; + +static void pvcalls_front_remove(struct xenbus_device *dev) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL, *n; + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + dev_set_drvdata(&dev->dev, NULL); + pvcalls_front_dev = NULL; + if (bedata->irq >= 0) + unbind_from_irqhandler(bedata->irq, dev); + + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + map->sock->sk->sk_send_head = NULL; + if (map->active_socket) { + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + } + } + + smp_mb(); + while (atomic_read(&pvcalls_refcount) > 0) + cpu_relax(); + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + if (map->active_socket) { + /* No need to lock, refcount is 0 */ + pvcalls_front_free_map(bedata, map); + } else { + list_del(&map->list); + kfree(map); + } + } + if (bedata->ref != -1) + gnttab_end_foreign_access(bedata->ref, NULL); + kfree(bedata->ring.sring); + kfree(bedata); + xenbus_switch_state(dev, XenbusStateClosed); +} + +static int pvcalls_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret = -ENOMEM, i; + evtchn_port_t evtchn; + unsigned int max_page_order, function_calls, len; + char *versions; + grant_ref_t gref_head = 0; + struct xenbus_transaction xbt; + struct pvcalls_bedata *bedata = NULL; + struct xen_pvcalls_sring *sring; + + if (pvcalls_front_dev != NULL) { + dev_err(&dev->dev, "only one PV Calls connection supported\n"); + return -EINVAL; + } + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (IS_ERR(versions)) + return PTR_ERR(versions); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_page_order = xenbus_read_unsigned(dev->otherend, + "max-page-order", 0); + if (max_page_order < PVCALLS_RING_ORDER) + return -ENODEV; + function_calls = xenbus_read_unsigned(dev->otherend, + "function-calls", 0); + /* See XENBUS_FUNCTIONS_CALLS in pvcalls.h */ + if (function_calls != 1) + return -ENODEV; + pr_info("%s max-page-order is %u\n", __func__, max_page_order); + + bedata = kzalloc(sizeof(struct pvcalls_bedata), GFP_KERNEL); + if (!bedata) + return -ENOMEM; + + dev_set_drvdata(&dev->dev, bedata); + pvcalls_front_dev = dev; + init_waitqueue_head(&bedata->inflight_req); + INIT_LIST_HEAD(&bedata->socket_mappings); + spin_lock_init(&bedata->socket_lock); + bedata->irq = -1; + bedata->ref = -1; + + for (i = 0; i < PVCALLS_NR_RSP_PER_RING; i++) + bedata->rsp[i].req_id = PVCALLS_INVALID_ID; + + sring = (struct xen_pvcalls_sring *) __get_free_page(GFP_KERNEL | + __GFP_ZERO); + if (!sring) + goto error; + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&bedata->ring, sring, 
XEN_PAGE_SIZE); + + ret = xenbus_alloc_evtchn(dev, &evtchn); + if (ret) + goto error; + + bedata->irq = bind_evtchn_to_irqhandler(evtchn, + pvcalls_front_event_handler, + 0, "pvcalls-frontend", dev); + if (bedata->irq < 0) { + ret = bedata->irq; + goto error; + } + + ret = gnttab_alloc_grant_references(1, &gref_head); + if (ret < 0) + goto error; + ret = gnttab_claim_grant_reference(&gref_head); + if (ret < 0) + goto error; + bedata->ref = ret; + gnttab_grant_foreign_access_ref(bedata->ref, dev->otherend_id, + virt_to_gfn((void *)sring), 0); + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", bedata->ref); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "port", "%u", + evtchn); + if (ret) + goto error_xenbus; + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + pvcalls_front_remove(dev); + return ret; +} + +static void pvcalls_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's CLOSING state */ + fallthrough; + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static struct xenbus_driver pvcalls_front_driver = { + .ids = pvcalls_front_ids, + .probe = pvcalls_front_probe, + .remove = pvcalls_front_remove, + .otherend_changed = pvcalls_front_changed, + .not_essential = true, +}; + +static int __init pvcalls_frontend_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen pvcalls frontend driver\n"); + + return xenbus_register_frontend(&pvcalls_front_driver); +} + +module_init(pvcalls_frontend_init); + +MODULE_DESCRIPTION("Xen PV Calls frontend driver"); +MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h new file mode 100644 index 000000000000..881ef14660bc --- /dev/null +++ b/drivers/xen/pvcalls-front.h @@ -0,0 +1,28 @@ +#ifndef __PVCALLS_FRONT_H__ +#define __PVCALLS_FRONT_H__ + +#include <linux/net.h> + +int pvcalls_front_socket(struct socket *sock); +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags); +int pvcalls_front_bind(struct socket *sock, + struct sockaddr *addr, + int addr_len); +int pvcalls_front_listen(struct socket *sock, int backlog); +int pvcalls_front_accept(struct socket *sock, + struct socket *newsock, + struct proto_accept_arg *arg); +int pvcalls_front_sendmsg(struct socket *sock, + struct msghdr *msg, + size_t len); +int pvcalls_front_recvmsg(struct socket *sock, + struct msghdr *msg, + size_t len, + int flags); +__poll_t pvcalls_front_poll(struct file *file, + struct socket *sock, + 
poll_table *wait); +int pvcalls_front_release(struct socket *sock); + +#endif diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index aadffcf7db9b..ccf25027bec1 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -1,18 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2010 * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> * * This code provides a IOMMU for Xen PV guests with PCI passthrough. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License v2.0 as published by - * the Free Software Foundation - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * PV guests under Xen are running in an non-contiguous memory architecture. * * When PCI pass-through is utilized, this necessitates an IOMMU for @@ -30,556 +22,390 @@ * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are * allocated in descending order (high to low), meaning the guest might * never get any MFN's under the 4GB mark. - * */ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt -#include <linux/bootmem.h> -#include <linux/dma-mapping.h> +#include <linux/memblock.h> +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> #include <xen/swiotlb-xen.h> #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -static char *xen_io_tlb_start, *xen_io_tlb_end; -static unsigned long xen_io_tlb_nslabs; +#include <asm/dma-mapping.h> + +#include <trace/events/swiotlb.h> +#define MAX_DMA_BITS 32 + /* * Quick lookup value of the bus address of the IOTLB. 
*/ -static u64 start_dma_addr; - -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +static inline phys_addr_t xen_phys_to_bus(struct device *dev, phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr)); + phys_addr_t baddr = (phys_addr_t)bfn << XEN_PAGE_SHIFT; + + baddr |= paddr & ~XEN_PAGE_MASK; + return baddr; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline dma_addr_t xen_phys_to_dma(struct device *dev, phys_addr_t paddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + return phys_to_dma(dev, xen_phys_to_bus(dev, paddr)); } -static dma_addr_t xen_virt_to_bus(void *address) +static inline phys_addr_t xen_bus_to_phys(struct device *dev, + phys_addr_t baddr) { - return xen_phys_to_bus(virt_to_phys(address)); + unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr)); + phys_addr_t paddr = (xen_pfn << XEN_PAGE_SHIFT) | + (baddr & ~XEN_PAGE_MASK); + + return paddr; } -static int check_pages_physically_contiguous(unsigned long pfn, - unsigned int offset, - size_t length) +static inline phys_addr_t xen_dma_to_phys(struct device *dev, + dma_addr_t dma_addr) { - unsigned long next_mfn; - int i; - int nr_pages; + return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); +} - next_mfn = pfn_to_mfn(pfn); - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; +static inline bool range_requires_alignment(phys_addr_t p, size_t size) +{ + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); + phys_addr_t bus_addr = pfn_to_bfn(XEN_PFN_DOWN(p)) << XEN_PAGE_SHIFT; - for (i = 1; i < nr_pages; i++) { - if (pfn_to_mfn(++pfn) != ++next_mfn) - return 0; - } - return 1; + return IS_ALIGNED(p, algn) && !IS_ALIGNED(bus_addr, algn); } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { - unsigned long pfn = PFN_DOWN(p); - unsigned int offset = p & ~PAGE_MASK; - - if (offset + size <= PAGE_SIZE) - return 0; - if (check_pages_physically_contiguous(pfn, offset, size)) - return 0; - return 1; + unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); + unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + + next_bfn = pfn_to_bfn(xen_pfn); + + for (i = 1; i < nr_pages; i++) + if (pfn_to_bfn(++xen_pfn) != ++next_bfn) + return 1; + + return 0; } -static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) +static struct io_tlb_pool *xen_swiotlb_find_pool(struct device *dev, + dma_addr_t dma_addr) { - unsigned long mfn = PFN_DOWN(dma_addr); - unsigned long pfn = mfn_to_local_pfn(mfn); - phys_addr_t paddr; + unsigned long bfn = XEN_PFN_DOWN(dma_to_phys(dev, dma_addr)); + unsigned long xen_pfn = bfn_to_local_pfn(bfn); + phys_addr_t paddr = (phys_addr_t)xen_pfn << XEN_PAGE_SHIFT; /* If the address is outside our domain, it CAN * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. 
*/ - if (pfn_valid(pfn)) { - paddr = PFN_PHYS(pfn); - return paddr >= virt_to_phys(xen_io_tlb_start) && - paddr < virt_to_phys(xen_io_tlb_end); - } - return 0; + if (pfn_valid(PFN_DOWN(paddr))) + return swiotlb_find_pool(dev, paddr); + return NULL; } -static int max_dma_bits = 32; - -static int -xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +#ifdef CONFIG_X86 +int __init xen_swiotlb_fixup(void *buf, unsigned long nslabs) { - int i, rc; - int dma_bits; + int rc; + unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); + unsigned int i, dma_bits = order + PAGE_SHIFT; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + BUILD_BUG_ON(IO_TLB_SEGSIZE & (IO_TLB_SEGSIZE - 1)); + BUG_ON(nslabs % IO_TLB_SEGSIZE); i = 0; do { - int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); - do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), - get_order(slabs << IO_TLB_SHIFT), - dma_bits); - } while (rc && dma_bits++ < max_dma_bits); + p + (i << IO_TLB_SHIFT), order, + dma_bits, &dma_handle); + } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; - i += slabs; + i += IO_TLB_SEGSIZE; } while (i < nslabs); return 0; } -static unsigned long xen_set_nslabs(unsigned long nr_tbl) -{ - if (!nr_tbl) { - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); - } else - xen_io_tlb_nslabs = nr_tbl; - - return xen_io_tlb_nslabs << IO_TLB_SHIFT; -} - -enum xen_swiotlb_err { - XEN_SWIOTLB_UNKNOWN = 0, - XEN_SWIOTLB_ENOMEM, - XEN_SWIOTLB_EFIXUP -}; -static const char *xen_swiotlb_error(enum xen_swiotlb_err err) -{ - switch (err) { - case XEN_SWIOTLB_ENOMEM: - return "Cannot allocate Xen-SWIOTLB buffer\n"; - case XEN_SWIOTLB_EFIXUP: - return "Failed to get contiguous memory for DMA from Xen!\n"\ - "You either: don't have the permissions, do not have"\ - " enough free memory under 4GB, or the hypervisor memory"\ - " is too fragmented!"; - default: - break; - } - return ""; -} -int __ref xen_swiotlb_init(int verbose, bool early) +static void * +xen_swiotlb_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - unsigned long bytes, order; - int rc = -ENOMEM; - enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; - unsigned int repeat = 3; - - xen_io_tlb_nslabs = swiotlb_nr_tbl(); -retry: - bytes = xen_set_nslabs(xen_io_tlb_nslabs); - order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); - /* - * Get IO TLB memory from any location. - */ - if (early) - xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); - else { -#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) -#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order); - if (xen_io_tlb_start) - break; - order--; - } - if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", - (PAGE_SIZE << order) >> 20); - xen_io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; - } - } - if (!xen_io_tlb_start) { - m_ret = XEN_SWIOTLB_ENOMEM; - goto error; - } - xen_io_tlb_end = xen_io_tlb_start + bytes; - /* - * And replace that memory with pages under 4GB. 
- */ - rc = xen_swiotlb_fixup(xen_io_tlb_start, - bytes, - xen_io_tlb_nslabs); - if (rc) { - if (early) - free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); - else { - free_pages((unsigned long)xen_io_tlb_start, order); - xen_io_tlb_start = NULL; - } - m_ret = XEN_SWIOTLB_EFIXUP; - goto error; - } - start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - if (early) { - if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, - verbose)) - panic("Cannot allocate SWIOTLB buffer"); - rc = 0; - } else - rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); - return rc; -error: - if (repeat--) { - xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ - (xen_io_tlb_nslabs >> 1)); - pr_info("Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); - goto retry; - } - pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); - if (early) - panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); - else - free_pages((unsigned long)xen_io_tlb_start, order); - return rc; -} -void * -xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) -{ - void *ret; + u64 dma_mask = dev->coherent_dma_mask; int order = get_order(size); - u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; - dma_addr_t dev_addr; - - /* - * Ignore region specifiers - the kernel's ideas of - * pseudo-phys memory layout has nothing to do with the - * machine physical layout. We can't allocate highmem - * because we can't return a pointer to it. - */ - flags &= ~(__GFP_DMA | __GFP_HIGHMEM); - - if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) - return ret; + void *ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* Align the allocation to the Xen page size */ + size = ALIGN(size, XEN_PAGE_SIZE); + ret = (void *)__get_free_pages(flags, get_order(size)); if (!ret) return ret; - - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); - dev_addr = xen_phys_to_bus(phys); - if (((dev_addr + size - 1 <= dma_mask)) && - !range_straddles_page_boundary(phys, size)) - *dma_handle = dev_addr; - else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); - return NULL; - } - *dma_handle = virt_to_machine(ret).maddr; + + *dma_handle = xen_phys_to_dma(dev, phys); + if (*dma_handle + size - 1 > dma_mask || + range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size)) { + if (xen_create_contiguous_region(phys, order, fls64(dma_mask), + dma_handle) != 0) + goto out_free_pages; + SetPageXenRemapped(virt_to_page(ret)); } + memset(ret, 0, size); return ret; + +out_free_pages: + free_pages((unsigned long)ret, get_order(size)); + return NULL; } -EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); -void -xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr, struct dma_attrs *attrs) +static void +xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) { + phys_addr_t phys = virt_to_phys(vaddr); int order = get_order(size); - phys_addr_t phys; - u64 dma_mask = DMA_BIT_MASK(32); - - if (dma_release_from_coherent(hwdev, order, vaddr)) - return; - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; + /* Convert the size to actually allocated. 
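+	 * xen_swiotlb_alloc_coherent() rounded the allocation up to
+	 * XEN_PAGE_SIZE, so the same rounding is applied before freeing.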
*/ + size = ALIGN(size, XEN_PAGE_SIZE); - phys = virt_to_phys(vaddr); + if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || + WARN_ON_ONCE(range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size))) + return; - if (((dev_addr + size - 1 > dma_mask)) || - range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); - - free_pages((unsigned long)vaddr, order); + if (TestClearPageXenRemapped(virt_to_page(vaddr))) + xen_destroy_contiguous_region(phys, order); + free_pages((unsigned long)vaddr, get_order(size)); } -EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); - +#endif /* CONFIG_X86 */ /* * Map a single buffer of the indicated size for DMA in streaming mode. The * physical address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. + * either xen_swiotlb_unmap_phys or xen_swiotlb_dma_sync_single is performed. */ -dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) +static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = xen_phys_to_bus(phys); + dma_addr_t dev_addr; + phys_addr_t map; BUG_ON(dir == DMA_NONE); + + if (attrs & DMA_ATTR_MMIO) { + if (unlikely(!dma_capable(dev, phys, size, false))) { + dev_err_once( + dev, + "DMA addr %pa+%zu overflow (mask %llx, bus limit %llx).\n", + &phys, size, *dev->dma_mask, + dev->bus_dma_limit); + WARN_ON_ONCE(1); + return DMA_MAPPING_ERROR; + } + return phys; + } + + dev_addr = xen_phys_to_dma(dev, phys); + /* * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) - return dev_addr; + if (dma_capable(dev, dev_addr, size, true) && + !dma_kmalloc_needs_bounce(dev, size, dir) && + !range_straddles_page_boundary(phys, size) && + !xen_arch_need_swiotlb(dev, phys, dev_addr) && + !is_swiotlb_force_bounce(dev)) + goto done; /* * Oh well, have to allocate and map a bounce buffer. */ - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (map == SWIOTLB_MAP_ERROR) - return DMA_ERROR_CODE; + trace_swiotlb_bounced(dev, dev_addr, size); - dev_addr = xen_phys_to_bus(map); + map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs); + if (map == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; + + phys = map; + dev_addr = xen_phys_to_dma(dev, map); /* * Ensure that the address returned is DMA'ble */ - if (!dma_capable(dev, dev_addr, size)) { - swiotlb_tbl_unmap_single(dev, map, size, dir); - dev_addr = 0; + if (unlikely(!dma_capable(dev, dev_addr, size, true))) { + __swiotlb_tbl_unmap_single(dev, map, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC, + swiotlb_find_pool(dev, map)); + return DMA_MAPPING_ERROR; + } + +done: + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) + arch_sync_dma_for_device(phys, size, dir); + else + xen_dma_sync_for_device(dev, dev_addr, size, dir); } return dev_addr; } -EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. 
The dma_addr and size must - * match what was provided for in a previous xen_swiotlb_map_page call. All + * match what was provided for in a previous xen_swiotlb_map_phys call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) +static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t paddr = xen_bus_to_phys(dev_addr); + phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr); + struct io_tlb_pool *pool; BUG_ON(dir == DMA_NONE); - /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); - return; + if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) + arch_sync_dma_for_cpu(paddr, size, dir); + else + xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); } - if (dir != DMA_FROM_DEVICE) - return; - - /* - * phys_to_virt doesn't work with hihgmem page but we could - * call dma_mark_clean() with hihgmem page here. However, we - * are fine since dma_mark_clean() is null on POWERPC. We can - * make dma_mark_clean() take a physical address if necessary. - */ - dma_mark_clean(phys_to_virt(paddr), size); -} - -void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - xen_unmap_single(hwdev, dev_addr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ + pool = xen_swiotlb_find_pool(hwdev, dev_addr); + if (pool) + __swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, + attrs, pool); } -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); -/* - * Make physical memory consistent for a single streaming mode DMA translation - * after a transfer. - * - * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer - * using the cpu, yet do not wish to teardown the dma mapping, you must - * call this function before doing so. At the next point you give the dma - * address back to the card, you must first perform a - * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer - */ static void -xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) { - phys_addr_t paddr = xen_bus_to_phys(dev_addr); - - BUG_ON(dir == DMA_NONE); - - /* NOTE: We use dev_addr here, not paddr! 
*/ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; + phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr); + struct io_tlb_pool *pool; + + if (!dev_is_dma_coherent(dev)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + arch_sync_dma_for_cpu(paddr, size, dir); + else + xen_dma_sync_for_cpu(dev, dma_addr, size, dir); } - if (dir != DMA_FROM_DEVICE) - return; - - dma_mark_clean(phys_to_virt(paddr), size); -} - -void -xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); + pool = xen_swiotlb_find_pool(dev, dma_addr); + if (pool) + __swiotlb_sync_single_for_cpu(dev, paddr, size, dir, pool); } -EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu); -void -xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) +static void +xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) { - xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); + phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr); + struct io_tlb_pool *pool; + + pool = xen_swiotlb_find_pool(dev, dma_addr); + if (pool) + __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); + + if (!dev_is_dma_coherent(dev)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + arch_sync_dma_for_device(paddr, size, dir); + else + xen_dma_sync_for_device(dev, dma_addr, size, dir); + } } -EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device); /* - * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above xen_swiotlb_map_page - * interface. Here the scatter gather list elements are each tagged with the - * appropriate dma address and length. They are obtained via - * sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for xen_swiotlb_map_page are the - * same here. + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_phys() above. */ -int -xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) +static void +xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = xen_phys_to_bus(paddr); - - if (swiotlb_force || - !dma_capable(hwdev, dev_addr, sg->length) || - range_straddles_page_boundary(paddr, sg->length)) { - phys_addr_t map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, - dir); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. 
*/ - xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sgl[0].dma_length = 0; - return DMA_ERROR_CODE; - } - sg->dma_address = xen_phys_to_bus(map); - } else - sg->dma_address = dev_addr; - sg->dma_length = sg->length; - } - return nelems; + for_each_sg(sgl, sg, nelems, i) + xen_swiotlb_unmap_phys(hwdev, sg->dma_address, sg_dma_len(sg), + dir, attrs); + } -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -/* - * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_page() above. - */ -void -xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) +static int +xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + for_each_sg(sgl, sg, nelems, i) { + sg->dma_address = xen_swiotlb_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) + goto out_unmap; + sg_dma_len(sg) = sg->length; + } + return nelems; +out_unmap: + xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sgl) = 0; + return -EIO; } -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -/* - * Make physical memory consistent for a set of streaming mode DMA translations - * after a transfer. - * - * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules - * and usage. - */ static void -xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - enum dma_sync_target target) +xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir) { struct scatterlist *sg; int i; - for_each_sg(sgl, sg, nelems, i) - xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); -} - -void -xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); + for_each_sg(sgl, sg, nelems, i) { + xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address, + sg->length, dir); + } } -EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu); -void -xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, +static void +xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device); + struct scatterlist *sg; + int i; -int -xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return !dma_addr; + for_each_sg(sgl, sg, nelems, i) { + xen_swiotlb_sync_single_for_device(dev, sg->dma_address, + sg->length, dir); + } } -EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); /* * Return whether the given device DMA address mask can be supported @@ -587,9 +413,32 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); * during bus mastering, then you would pass 0x00ffffff as the mask to * this function. 
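 * (e.g. the mask a driver would install with dma_set_mask(dev, DMA_BIT_MASK(24))).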
*/ -int +static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; + return xen_phys_to_dma(hwdev, default_swiotlb_limit()) <= mask; } -EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +const struct dma_map_ops xen_swiotlb_dma_ops = { +#ifdef CONFIG_X86 + .alloc = xen_swiotlb_alloc_coherent, + .free = xen_swiotlb_free_coherent, +#else + .alloc = dma_direct_alloc, + .free = dma_direct_free, +#endif + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg, + .unmap_sg = xen_swiotlb_unmap_sg, + .map_phys = xen_swiotlb_map_phys, + .unmap_phys = xen_swiotlb_unmap_phys, + .dma_supported = xen_swiotlb_dma_supported, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .alloc_pages_op = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, + .max_mapping_size = swiotlb_max_mapping_size, +}; diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 96453f8a85c5..2f880374b463 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -1,15 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * copyright (c) 2006 IBM Corporation * Authored by: Mike D. Day <ncmike@us.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/slab.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/kobject.h> #include <linux/err.h> @@ -20,19 +17,24 @@ #include <xen/xenbus.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> +#ifdef CONFIG_XEN_HAVE_VPMU +#include <xen/interface/xenpmu.h> +#endif #define HYPERVISOR_ATTR_RO(_name) \ -static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) #define HYPERVISOR_ATTR_RW(_name) \ -static struct hyp_sysfs_attr _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) +static struct hyp_sysfs_attr _name##_attr = __ATTR_RW(_name) struct hyp_sysfs_attr { struct attribute attr; ssize_t (*show)(struct hyp_sysfs_attr *, char *); ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); - void *hyp_attr_data; + union { + void *hyp_attr_data; + unsigned long hyp_attr_value; + }; }; static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) @@ -47,9 +49,33 @@ static int __init xen_sysfs_type_init(void) return sysfs_create_file(hypervisor_kobj, &type_attr.attr); } -static void xen_sysfs_type_destroy(void) +static ssize_t guest_type_show(struct hyp_sysfs_attr *attr, char *buffer) { - sysfs_remove_file(hypervisor_kobj, &type_attr.attr); + const char *type; + + switch (xen_domain_type) { + case XEN_NATIVE: + /* ARM only. */ + type = "Xen"; + break; + case XEN_PV_DOMAIN: + type = "PV"; + break; + case XEN_HVM_DOMAIN: + type = xen_pvh_domain() ? 
"PVH" : "HVM"; + break; + default: + return -EINVAL; + } + + return sprintf(buffer, "%s\n", type); +} + +HYPERVISOR_ATTR_RO(guest_type); + +static int __init xen_sysfs_guest_type_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &guest_type_attr.attr); } /* xen version attributes */ @@ -108,11 +134,6 @@ static int __init xen_sysfs_version_init(void) return sysfs_create_group(hypervisor_kobj, &version_group); } -static void xen_sysfs_version_destroy(void) -{ - sysfs_remove_group(hypervisor_kobj, &version_group); -} - /* UUID */ static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer) @@ -154,11 +175,6 @@ static int __init xen_sysfs_uuid_init(void) return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); } -static void xen_sysfs_uuid_destroy(void) -{ - sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); -} - /* xen compilation attributes */ static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) @@ -227,16 +243,11 @@ static const struct attribute_group xen_compilation_group = { .attrs = xen_compile_attrs, }; -static int __init xen_compilation_init(void) +static int __init xen_sysfs_compilation_init(void) { return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); } -static void xen_compilation_destroy(void) -{ - sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); -} - /* xen properties info */ static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) @@ -344,12 +355,40 @@ static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer) HYPERVISOR_ATTR_RO(features); +static ssize_t buildid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + ssize_t ret; + struct xen_build_id *buildid; + + ret = HYPERVISOR_xen_version(XENVER_build_id, NULL); + if (ret < 0) { + if (ret == -EPERM) + ret = sprintf(buffer, "<denied>"); + return ret; + } + + buildid = kmalloc(sizeof(*buildid) + ret, GFP_KERNEL); + if (!buildid) + return -ENOMEM; + + buildid->len = ret; + ret = HYPERVISOR_xen_version(XENVER_build_id, buildid); + if (ret > 0) + ret = sprintf(buffer, "%s", buildid->buf); + kfree(buildid); + + return ret; +} + +HYPERVISOR_ATTR_RO(buildid); + static struct attribute *xen_properties_attrs[] = { &capabilities_attr.attr, &changeset_attr.attr, &virtual_start_attr.attr, &pagesize_attr.attr, &features_attr.attr, + &buildid_attr.attr, NULL }; @@ -358,16 +397,180 @@ static const struct attribute_group xen_properties_group = { .attrs = xen_properties_attrs, }; -static int __init xen_properties_init(void) +static int __init xen_sysfs_properties_init(void) { return sysfs_create_group(hypervisor_kobj, &xen_properties_group); } -static void xen_properties_destroy(void) +#define FLAG_UNAME "unknown" +#define FLAG_UNAME_FMT FLAG_UNAME "%02u" +#define FLAG_UNAME_MAX sizeof(FLAG_UNAME "XX") +#define FLAG_COUNT (sizeof(xen_start_flags) * BITS_PER_BYTE) +static_assert(sizeof(xen_start_flags) <= + sizeof_field(struct hyp_sysfs_attr, hyp_attr_value)); + +static ssize_t flag_show(struct hyp_sysfs_attr *attr, char *buffer) { - sysfs_remove_group(hypervisor_kobj, &xen_properties_group); + char *p = buffer; + + *p++ = '0' + ((xen_start_flags & attr->hyp_attr_value) != 0); + *p++ = '\n'; + return p - buffer; +} + +#define FLAG_NODE(flag, node) \ + [ilog2(flag)] = { \ + .attr = { .name = #node, .mode = 0444 },\ + .show = flag_show, \ + .hyp_attr_value = flag \ + } + +/* + * Add new, known flags here. No other changes are required, but + * note that each known flag wastes one entry in flag_unames[]. 
+ * The code/complexity machinations to avoid this isn't worth it + * for a few entries, but keep it in mind. + */ +static struct hyp_sysfs_attr flag_attrs[FLAG_COUNT] = { + FLAG_NODE(SIF_PRIVILEGED, privileged), + FLAG_NODE(SIF_INITDOMAIN, initdomain) +}; +static struct attribute_group xen_flags_group = { + .name = "start_flags", + .attrs = (struct attribute *[FLAG_COUNT + 1]){} +}; +static char flag_unames[FLAG_COUNT][FLAG_UNAME_MAX]; + +static int __init xen_sysfs_flags_init(void) +{ + for (unsigned fnum = 0; fnum != FLAG_COUNT; fnum++) { + if (likely(flag_attrs[fnum].attr.name == NULL)) { + sprintf(flag_unames[fnum], FLAG_UNAME_FMT, fnum); + flag_attrs[fnum].attr.name = flag_unames[fnum]; + flag_attrs[fnum].attr.mode = 0444; + flag_attrs[fnum].show = flag_show; + flag_attrs[fnum].hyp_attr_value = 1 << fnum; + } + xen_flags_group.attrs[fnum] = &flag_attrs[fnum].attr; + } + return sysfs_create_group(hypervisor_kobj, &xen_flags_group); } +#ifdef CONFIG_XEN_HAVE_VPMU +struct pmu_mode { + const char *name; + uint32_t mode; +}; + +static struct pmu_mode pmu_modes[] = { + {"off", XENPMU_MODE_OFF}, + {"self", XENPMU_MODE_SELF}, + {"hv", XENPMU_MODE_HV}, + {"all", XENPMU_MODE_ALL} +}; + +static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + struct xen_pmu_params xp; + int i; + + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) { + xp.val = pmu_modes[i].mode; + break; + } + } + + if (i == ARRAY_SIZE(pmu_modes)) + return -EINVAL; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + int i; + uint32_t mode; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp); + if (ret) + return ret; + + mode = (uint32_t)xp.val; + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (mode == pmu_modes[i].mode) + return sprintf(buffer, "%s\n", pmu_modes[i].name); + } + + return -EINVAL; +} +HYPERVISOR_ATTR_RW(pmu_mode); + +static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + uint32_t features; + struct xen_pmu_params xp; + + ret = kstrtou32(buffer, 0, &features); + if (ret) + return ret; + + xp.val = features; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp); + if (ret) + return ret; + + return sprintf(buffer, "0x%x\n", (uint32_t)xp.val); +} +HYPERVISOR_ATTR_RW(pmu_features); + +static struct attribute *xen_pmu_attrs[] = { + &pmu_mode_attr.attr, + &pmu_features_attr.attr, + NULL +}; + +static const struct attribute_group xen_pmu_group = { + .name = "pmu", + .attrs = xen_pmu_attrs, +}; + +static int __init xen_sysfs_pmu_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_pmu_group); +} +#endif + static int __init hyper_sysfs_init(void) { int ret; @@ -378,44 +581,51 @@ static int __init hyper_sysfs_init(void) ret = xen_sysfs_type_init(); if (ret) goto out; + ret = 
xen_sysfs_guest_type_init(); + if (ret) + goto guest_type_out; ret = xen_sysfs_version_init(); if (ret) goto version_out; - ret = xen_compilation_init(); + ret = xen_sysfs_compilation_init(); if (ret) goto comp_out; ret = xen_sysfs_uuid_init(); if (ret) goto uuid_out; - ret = xen_properties_init(); + ret = xen_sysfs_properties_init(); if (ret) goto prop_out; - + ret = xen_sysfs_flags_init(); + if (ret) + goto flags_out; +#ifdef CONFIG_XEN_HAVE_VPMU + if (xen_initial_domain()) { + ret = xen_sysfs_pmu_init(); + if (ret) { + sysfs_remove_group(hypervisor_kobj, &xen_flags_group); + goto flags_out; + } + } +#endif goto out; +flags_out: + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); prop_out: - xen_sysfs_uuid_destroy(); + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); uuid_out: - xen_compilation_destroy(); + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); comp_out: - xen_sysfs_version_destroy(); + sysfs_remove_group(hypervisor_kobj, &version_group); version_out: - xen_sysfs_type_destroy(); + sysfs_remove_file(hypervisor_kobj, &guest_type_attr.attr); +guest_type_out: + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); out: return ret; } - -static void __exit hyper_sysfs_exit(void) -{ - xen_properties_destroy(); - xen_compilation_destroy(); - xen_sysfs_uuid_destroy(); - xen_sysfs_version_destroy(); - xen_sysfs_type_destroy(); - -} -module_init(hyper_sysfs_init); -module_exit(hyper_sysfs_exit); +device_initcall(hyper_sysfs_init); static ssize_t hyp_sysfs_show(struct kobject *kobj, struct attribute *attr, @@ -445,7 +655,7 @@ static const struct sysfs_ops hyp_sysfs_ops = { .store = hyp_sysfs_store, }; -static struct kobj_type hyp_sysfs_kobj_type = { +static const struct kobj_type hyp_sysfs_kobj_type = { .sysfs_ops = &hyp_sysfs_ops, }; diff --git a/drivers/xen/time.c b/drivers/xen/time.c new file mode 100644 index 000000000000..5683383d2305 --- /dev/null +++ b/drivers/xen/time.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Xen stolen ticks accounting. + */ +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/math64.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/static_call.h> + +#include <asm/paravirt.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/events.h> +#include <xen/features.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/xen-ops.h> + +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); + +static DEFINE_PER_CPU(u64[4], old_runstate_time); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l, h2; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = READ_ONCE(p32[1]); + l = READ_ONCE(p32[0]); + h2 = READ_ONCE(p32[1]); + } while(h2 != h); + + ret = (((u64)h) << 32) | l; + } else + ret = READ_ONCE(*p); + + return ret; +} + +static void xen_get_runstate_snapshot_cpu_delta( + struct vcpu_runstate_info *res, unsigned int cpu) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = per_cpu_ptr(&xen_runstate, cpu); + + do { + state_time = get64(&state->state_entry_time); + rmb(); /* Hypervisor might update data. 
*/ + *res = __READ_ONCE(*state); + rmb(); /* Hypervisor might update data. */ + } while (get64(&state->state_entry_time) != state_time || + (state_time & XEN_RUNSTATE_UPDATE)); +} + +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, + unsigned int cpu) +{ + int i; + + xen_get_runstate_snapshot_cpu_delta(res, cpu); + + for (i = 0; i < 4; i++) + res->time[i] += per_cpu(old_runstate_time, cpu)[i]; +} + +void xen_manage_runstate_time(int action) +{ + static struct vcpu_runstate_info *runstate_delta; + struct vcpu_runstate_info state; + int cpu, i; + + switch (action) { + case -1: /* backup runstate time before suspend */ + if (unlikely(runstate_delta)) + pr_warn_once("%s: memory leak as runstate_delta is not NULL\n", + __func__); + + runstate_delta = kmalloc_array(num_possible_cpus(), + sizeof(*runstate_delta), + GFP_ATOMIC); + if (unlikely(!runstate_delta)) { + pr_warn("%s: failed to allocate runstate_delta\n", + __func__); + return; + } + + for_each_possible_cpu(cpu) { + xen_get_runstate_snapshot_cpu_delta(&state, cpu); + memcpy(runstate_delta[cpu].time, state.time, + sizeof(runstate_delta[cpu].time)); + } + + break; + + case 0: /* backup runstate time after resume */ + if (unlikely(!runstate_delta)) { + pr_warn("%s: cannot accumulate runstate time as runstate_delta is NULL\n", + __func__); + return; + } + + for_each_possible_cpu(cpu) { + for (i = 0; i < 4; i++) + per_cpu(old_runstate_time, cpu)[i] += + runstate_delta[cpu].time[i]; + } + + break; + + default: /* do not accumulate runstate time for checkpointing */ + break; + } + + if (action != -1 && runstate_delta) { + kfree(runstate_delta); + runstate_delta = NULL; + } +} + +/* return true when a vcpu could run but has no real cpu to run on */ +bool xen_vcpu_stolen(int vcpu) +{ + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; +} + +u64 xen_steal_clock(int cpu) +{ + struct vcpu_runstate_info state; + + xen_get_runstate_snapshot_cpu(&state, cpu); + return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; +} + +void xen_setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(xen_runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + xen_vcpu_nr(cpu), &area)) + BUG(); +} + +void __init xen_time_setup_guest(void) +{ + bool xen_runstate_remote; + + xen_runstate_remote = !HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_runstate_update_flag); + + static_call_update(pv_steal_clock, xen_steal_clock); + + static_key_slow_inc(¶virt_steal_enabled); + if (xen_runstate_remote) + static_key_slow_inc(¶virt_steal_rq_enabled); +} diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c deleted file mode 100644 index 83b5c53bec6b..000000000000 --- a/drivers/xen/tmem.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Xen implementation for transcendent memory (tmem) - * - * Copyright (C) 2009-2011 Oracle Corp. All rights reserved. 
- * Author: Dan Magenheimer - */ - -#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/cleancache.h> -#include <linux/frontswap.h> - -#include <xen/xen.h> -#include <xen/interface/xen.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/page.h> -#include <asm/xen/hypervisor.h> -#include <xen/tmem.h> - -#ifndef CONFIG_XEN_TMEM_MODULE -bool __read_mostly tmem_enabled = false; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = true; - return 1; -} -__setup("tmem", enable_tmem); -#endif - -#ifdef CONFIG_CLEANCACHE -static bool cleancache __read_mostly = true; -module_param(cleancache, bool, S_IRUGO); -static bool selfballooning __read_mostly = true; -module_param(selfballooning, bool, S_IRUGO); -#endif /* CONFIG_CLEANCACHE */ - -#ifdef CONFIG_FRONTSWAP -static bool frontswap __read_mostly = true; -module_param(frontswap, bool, S_IRUGO); -#else /* CONFIG_FRONTSWAP */ -#define frontswap (0) -#endif /* CONFIG_FRONTSWAP */ - -#ifdef CONFIG_XEN_SELFBALLOONING -static bool selfshrinking __read_mostly = true; -module_param(selfshrinking, bool, S_IRUGO); -#endif /* CONFIG_XEN_SELFBALLOONING */ - -#define TMEM_CONTROL 0 -#define TMEM_NEW_POOL 1 -#define TMEM_DESTROY_POOL 2 -#define TMEM_NEW_PAGE 3 -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_READ 8 -#define TMEM_WRITE 9 -#define TMEM_XCHG 10 - -/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_VERSION_SHIFT 24 - - -struct tmem_pool_uuid { - u64 uuid_lo; - u64 uuid_hi; -}; - -struct tmem_oid { - u64 oid[3]; -}; - -#define TMEM_POOL_PRIVATE_UUID { 0, 0 } - -/* flags for tmem_ops.new_pool */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 - -/* xen tmem foundation ops/hypercalls */ - -static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, - u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) -{ - struct tmem_op op; - int rc = 0; - - op.cmd = tmem_cmd; - op.pool_id = tmem_pool; - op.u.gen.oid[0] = oid.oid[0]; - op.u.gen.oid[1] = oid.oid[1]; - op.u.gen.oid[2] = oid.oid[2]; - op.u.gen.index = index; - op.u.gen.tmem_offset = tmem_offset; - op.u.gen.pfn_offset = pfn_offset; - op.u.gen.len = len; - set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, - u32 flags, unsigned long pagesize) -{ - struct tmem_op op; - int rc = 0, pageshift; - - for (pageshift = 0; pagesize != 1; pageshift++) - pagesize >>= 1; - flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; - flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; - op.cmd = TMEM_NEW_POOL; - op.u.new.uuid[0] = uuid.uuid_lo; - op.u.new.uuid[1] = uuid.uuid_hi; - op.u.new.flags = flags; - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -/* xen generic tmem ops */ - -static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, - u32 index, unsigned long pfn) -{ - unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; - - return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, - gmfn, 0, 0, 0); -} - -static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) -{ - return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, - 0, 0, 0, 0); -} - -static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) -{ - return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); -} - - -#ifdef CONFIG_CLEANCACHE -static int xen_tmem_destroy_pool(u32 pool_id) -{ - struct tmem_oid oid = { { 0 } }; - - return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); -} - -/* cleancache ops */ - -static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - - if (pool < 0) - return; - if (ind != index) - return; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); -} - -static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - unsigned long pfn = page_to_pfn(page); - int ret; - - /* translate return values to linux semantics */ - if (pool < 0) - return -1; - if (ind != index) - return -1; - ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); - if (ret == 1) - return 0; - else - return -1; -} - -static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - if (ind != index) - return; - (void)xen_tmem_flush_page((u32)pool, oid, ind); -} - -static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - (void)xen_tmem_flush_object((u32)pool, oid); -} - -static void tmem_cleancache_flush_fs(int pool) -{ - if (pool < 0) - return; - (void)xen_tmem_destroy_pool((u32)pool); -} - -static int tmem_cleancache_init_fs(size_t pagesize) -{ - struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; - - return xen_tmem_new_pool(uuid_private, 0, pagesize); -} - -static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - struct tmem_pool_uuid shared_uuid; - - shared_uuid.uuid_lo = *(u64 *)uuid; - shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); - return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); -} - -static struct cleancache_ops tmem_cleancache_ops = { - .put_page = tmem_cleancache_put_page, - .get_page = tmem_cleancache_get_page, - .invalidate_page = tmem_cleancache_flush_page, - .invalidate_inode = tmem_cleancache_flush_inode, - .invalidate_fs = tmem_cleancache_flush_fs, - .init_shared_fs = tmem_cleancache_init_shared_fs, - .init_fs = tmem_cleancache_init_fs -}; -#endif - -#ifdef CONFIG_FRONTSWAP -/* frontswap tmem operations */ - -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int tmem_frontswap_poolid; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 4 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -/* returns 0 if the page was successfully put into frontswap, -1 if not */ -static int tmem_frontswap_store(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - unsigned long pfn = page_to_pfn(page); - int pool = tmem_frontswap_poolid; - int ret; - - if (pool < 0) - return -1; - if (ind64 != ind) - return -1; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); - /* translate Xen tmem return values to linux semantics */ - if (ret == 1) - return 0; - else - return -1; -} - -/* - * returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) - */ -static int tmem_frontswap_load(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - unsigned long pfn = page_to_pfn(page); - int pool = tmem_frontswap_poolid; - int ret; - - if (pool < 0) - return -1; - if (ind64 != ind) - return -1; - ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); - /* translate Xen tmem return values to linux semantics */ - if (ret == 1) - return 0; - else - return -1; -} - -/* flush a single page from frontswap */ -static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - int pool = tmem_frontswap_poolid; - - if (pool < 0) - return; - if (ind64 != ind) - return; - (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void tmem_frontswap_flush_area(unsigned type) -{ - int pool = tmem_frontswap_poolid; - int ind; - - if (pool < 0) - return; - for (ind = SWIZ_MASK; ind >= 0; ind--) - (void)xen_tmem_flush_object(pool, oswiz(type, ind)); -} - -static void tmem_frontswap_init(unsigned ignored) -{ - struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; - - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (tmem_frontswap_poolid < 0) - tmem_frontswap_poolid = - xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); -} - -static struct frontswap_ops tmem_frontswap_ops = { - .store = tmem_frontswap_store, - .load = tmem_frontswap_load, - .invalidate_page = tmem_frontswap_flush_page, - .invalidate_area = tmem_frontswap_flush_area, - .init = tmem_frontswap_init -}; -#endif - -static int xen_tmem_init(void) -{ - if (!xen_domain()) - return 0; -#ifdef CONFIG_FRONTSWAP - if (tmem_enabled && frontswap) { - char *s = ""; - struct frontswap_ops *old_ops; - - tmem_frontswap_poolid = -1; - old_ops = frontswap_register_ops(&tmem_frontswap_ops); - if (IS_ERR(old_ops) || old_ops) { - if (IS_ERR(old_ops)) - return PTR_ERR(old_ops); - s = " (WARNING: frontswap_ops overridden)"; - } - pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", - s); - } -#endif -#ifdef CONFIG_CLEANCACHE - BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && cleancache) { - char *s = ""; - struct cleancache_ops *old_ops = - cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops) - s = " (WARNING: 
cleancache_ops overridden)"; - pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", - s); - } -#endif -#ifdef CONFIG_XEN_SELFBALLOONING - /* - * There is no point of driving pages to the swap system if they - * aren't going anywhere in tmem universe. - */ - if (!frontswap) { - selfshrinking = false; - selfballooning = false; - } - xen_selfballoon_init(selfballooning, selfshrinking); -#endif - return 0; -} - -module_init(xen_tmem_init) -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); -MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c new file mode 100644 index 000000000000..d6fc2aefe264 --- /dev/null +++ b/drivers/xen/unpopulated-alloc.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/memremap.h> +#include <linux/slab.h> + +#include <asm/page.h> + +#include <xen/balloon.h> +#include <xen/page.h> +#include <xen/xen.h> + +static DEFINE_MUTEX(list_lock); +static struct page *page_list; +static unsigned int list_count; + +static struct resource *target_resource; + +/* + * If arch is not happy with system "iomem_resource" being used for + * the region allocation it can provide it's own view by creating specific + * Xen resource with unused regions of guest physical address space provided + * by the hypervisor. + */ +int __weak __init arch_xen_unpopulated_init(struct resource **res) +{ + *res = &iomem_resource; + + return 0; +} + +static int fill_list(unsigned int nr_pages) +{ + struct dev_pagemap *pgmap; + struct resource *res, *tmp_res = NULL; + void *vaddr; + unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION); + struct range mhp_range; + int ret; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + res->name = "Xen scratch"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + mhp_range = mhp_get_pluggable_range(true); + + ret = allocate_resource(target_resource, res, + alloc_pages * PAGE_SIZE, mhp_range.start, mhp_range.end, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new IOMEM resource\n"); + goto err_resource; + } + + /* + * Reserve the region previously allocated from Xen resource to avoid + * re-using it by someone else. + */ + if (target_resource != &iomem_resource) { + tmp_res = kzalloc(sizeof(*tmp_res), GFP_KERNEL); + if (!tmp_res) { + ret = -ENOMEM; + goto err_insert; + } + + tmp_res->name = res->name; + tmp_res->start = res->start; + tmp_res->end = res->end; + tmp_res->flags = res->flags; + + ret = request_resource(&iomem_resource, tmp_res); + if (ret < 0) { + pr_err("Cannot request resource %pR (%d)\n", tmp_res, ret); + kfree(tmp_res); + goto err_insert; + } + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) { + ret = -ENOMEM; + goto err_pgmap; + } + + pgmap->type = MEMORY_DEVICE_GENERIC; + pgmap->range = (struct range) { + .start = res->start, + .end = res->end, + }; + pgmap->nr_range = 1; + pgmap->owner = res; + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * memremap will build page tables for the new memory so + * the p2m must contain invalid entries so the correct + * non-present PTEs will be written. + * + * If a failure occurs, the original (identity) p2m entries + * are not restored since this region is now known not to + * conflict with any devices. 
+ */ + if (xen_pv_domain()) { + xen_pfn_t pfn = PFN_DOWN(res->start); + + for (i = 0; i < alloc_pages; i++) { + if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { + pr_warn("set_phys_to_machine() failed, no memory added\n"); + ret = -ENOMEM; + goto err_memremap; + } + } + } +#endif + + vaddr = memremap_pages(pgmap, NUMA_NO_NODE); + if (IS_ERR(vaddr)) { + pr_err("Cannot remap memory range\n"); + ret = PTR_ERR(vaddr); + goto err_memremap; + } + + for (i = 0; i < alloc_pages; i++) { + struct page *pg = virt_to_page(vaddr + PAGE_SIZE * i); + + pg->zone_device_data = page_list; + page_list = pg; + list_count++; + } + + return 0; + +err_memremap: + kfree(pgmap); +err_pgmap: + if (tmp_res) { + release_resource(tmp_res); + kfree(tmp_res); + } +err_insert: + release_resource(res); +err_resource: + kfree(res); + return ret; +} + +/** + * xen_alloc_unpopulated_pages - alloc unpopulated pages + * @nr_pages: Number of pages + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + int ret = 0; + + /* + * Fallback to default behavior if we do not have any suitable resource + * to allocate required region from and as the result we won't be able to + * construct pages. + */ + if (!target_resource) + return xen_alloc_ballooned_pages(nr_pages, pages); + + mutex_lock(&list_lock); + if (list_count < nr_pages) { + ret = fill_list(nr_pages - list_count); + if (ret) + goto out; + } + + for (i = 0; i < nr_pages; i++) { + struct page *pg = page_list; + + BUG_ON(!pg); + page_list = pg->zone_device_data; + list_count--; + pages[i] = pg; + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (xen_pv_domain()) { + ret = xen_alloc_p2m_entry(page_to_pfn(pg)); + if (ret < 0) { + unsigned int j; + + for (j = 0; j <= i; j++) { + pages[j]->zone_device_data = page_list; + page_list = pages[j]; + list_count++; + } + goto out; + } + } +#endif + } + +out: + mutex_unlock(&list_lock); + return ret; +} +EXPORT_SYMBOL(xen_alloc_unpopulated_pages); + +/** + * xen_free_unpopulated_pages - return unpopulated pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + + if (!target_resource) { + xen_free_ballooned_pages(nr_pages, pages); + return; + } + + mutex_lock(&list_lock); + for (i = 0; i < nr_pages; i++) { + pages[i]->zone_device_data = page_list; + page_list = pages[i]; + list_count++; + } + mutex_unlock(&list_lock); +} +EXPORT_SYMBOL(xen_free_unpopulated_pages); + +static int __init unpopulated_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = arch_xen_unpopulated_init(&target_resource); + if (ret) { + pr_err("xen:unpopulated: Cannot initialize target resource\n"); + target_resource = NULL; + } + + return ret; +} +early_initcall(unpopulated_init); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c deleted file mode 100644 index 0caf4863be8c..000000000000 --- a/drivers/xen/xen-acpi-cpuhotplug.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/cpu.h> -#include <linux/acpi.h> -#include <linux/uaccess.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> -#include <acpi/processor.h> - -#include <xen/acpi.h> -#include <xen/interface/platform.h> -#include <asm/xen/hypercall.h> - -#define PREFIX "ACPI:xen_cpu_hotplug:" - -#define INSTALL_NOTIFY_HANDLER 0 -#define UNINSTALL_NOTIFY_HANDLER 1 - -static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); - -/* -------------------------------------------------------------------------- - Driver Interface --------------------------------------------------------------------------- */ - -static int xen_acpi_processor_enable(struct acpi_device *device) -{ - acpi_status status = 0; - unsigned long long value; - union acpi_object object = { 0 }; - struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; - struct acpi_processor *pr; - - pr = acpi_driver_data(device); - if (!pr) { - pr_err(PREFIX "Cannot find driver data\n"); - return -EINVAL; - } - - if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { - /* Declared with "Processor" statement; match ProcessorID */ - status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); - if (ACPI_FAILURE(status)) { - pr_err(PREFIX "Evaluating processor object\n"); - return -ENODEV; - } - - pr->acpi_id = object.processor.proc_id; - } else { - /* Declared with "Device" statement; match _UID */ - status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, - NULL, &value); - if (ACPI_FAILURE(status)) { - pr_err(PREFIX "Evaluating processor _UID\n"); - return -ENODEV; - } - - pr->acpi_id = value; - } - - pr->id = xen_pcpu_id(pr->acpi_id); - - if ((int)pr->id < 0) - /* This cpu is not presented at hypervisor, try to hotadd it */ - if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { - pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", - pr->acpi_id); - return -ENODEV; - } - - return 0; -} - -static int __cpuinit xen_acpi_processor_add(struct acpi_device *device) -{ - int ret; - struct acpi_processor *pr; - - if (!device) - return -EINVAL; - - pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (!pr) - return -ENOMEM; - - pr->handle = device->handle; - strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); - strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); - device->driver_data = pr; - - ret = xen_acpi_processor_enable(device); - if (ret) - pr_err(PREFIX "Error when enabling Xen processor\n"); - - return ret; -} - -static int xen_acpi_processor_remove(struct acpi_device *device) -{ - struct acpi_processor *pr; - - if (!device) - return -EINVAL; - - pr = acpi_driver_data(device); - if (!pr) - return -EINVAL; - - kfree(pr); - return 0; -} - -/*-------------------------------------------------------------- - Acpi processor hotplug support ---------------------------------------------------------------*/ - -static int is_processor_present(acpi_handle handle) -{ - acpi_status status; - unsigned long long sta = 0; - - - status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); - - if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) - 
return 1; - - /* - * _STA is mandatory for a processor that supports hot plug - */ - if (status == AE_NOT_FOUND) - pr_info(PREFIX "Processor does not support hot plug\n"); - else - pr_info(PREFIX "Processor Device is not present"); - return 0; -} - -static int xen_apic_id(acpi_handle handle) -{ - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *obj; - struct acpi_madt_local_apic *lapic; - int apic_id; - - if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) - return -EINVAL; - - if (!buffer.length || !buffer.pointer) - return -EINVAL; - - obj = buffer.pointer; - if (obj->type != ACPI_TYPE_BUFFER || - obj->buffer.length < sizeof(*lapic)) { - kfree(buffer.pointer); - return -EINVAL; - } - - lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; - - if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || - !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { - kfree(buffer.pointer); - return -EINVAL; - } - - apic_id = (uint32_t)lapic->id; - kfree(buffer.pointer); - buffer.length = ACPI_ALLOCATE_BUFFER; - buffer.pointer = NULL; - - return apic_id; -} - -static int xen_hotadd_cpu(struct acpi_processor *pr) -{ - int cpu_id, apic_id, pxm; - struct xen_platform_op op; - - apic_id = xen_apic_id(pr->handle); - if (apic_id < 0) { - pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", - pr->acpi_id); - return -ENODEV; - } - - pxm = xen_acpi_get_pxm(pr->handle); - if (pxm < 0) { - pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", - pr->acpi_id); - return pxm; - } - - op.cmd = XENPF_cpu_hotadd; - op.u.cpu_add.apic_id = apic_id; - op.u.cpu_add.acpi_id = pr->acpi_id; - op.u.cpu_add.pxm = pxm; - - cpu_id = HYPERVISOR_dom0_op(&op); - if (cpu_id < 0) - pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", - pr->acpi_id); - - return cpu_id; -} - -static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) -{ - if (!is_processor_present(pr->handle)) - return AE_ERROR; - - pr->id = xen_hotadd_cpu(pr); - if ((int)pr->id < 0) - return AE_ERROR; - - /* - * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX - * interface after cpu hotadded. - */ - xen_pcpu_hotplug_sync(); - - return AE_OK; -} - -static int acpi_processor_device_remove(struct acpi_device *device) -{ - pr_debug(PREFIX "Xen does not support CPU hotremove\n"); - - return -ENOSYS; -} - -static void acpi_processor_hotplug_notify(acpi_handle handle, - u32 event, void *data) -{ - struct acpi_processor *pr; - struct acpi_device *device = NULL; - u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ - int result; - - acpi_scan_lock_acquire(); - - switch (event) { - case ACPI_NOTIFY_BUS_CHECK: - case ACPI_NOTIFY_DEVICE_CHECK: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Processor driver received %s event\n", - (event == ACPI_NOTIFY_BUS_CHECK) ? 
- "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); - - if (!is_processor_present(handle)) - break; - - if (!acpi_bus_get_device(handle, &device)) - break; - - result = acpi_bus_scan(handle); - if (result) { - pr_err(PREFIX "Unable to add the device\n"); - break; - } - result = acpi_bus_get_device(handle, &device); - if (result) { - pr_err(PREFIX "Missing device object\n"); - break; - } - ost_code = ACPI_OST_SC_SUCCESS; - break; - - case ACPI_NOTIFY_EJECT_REQUEST: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "received ACPI_NOTIFY_EJECT_REQUEST\n")); - - if (acpi_bus_get_device(handle, &device)) { - pr_err(PREFIX "Device don't exist, dropping EJECT\n"); - break; - } - pr = acpi_driver_data(device); - if (!pr) { - pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); - break; - } - - /* - * TBD: implement acpi_processor_device_remove if Xen support - * CPU hotremove in the future. - */ - acpi_processor_device_remove(device); - break; - - default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); - - /* non-hotplug event; possibly handled by other handler */ - goto out; - } - - (void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); - -out: - acpi_scan_lock_release(); -} - -static acpi_status is_processor_device(acpi_handle handle) -{ - struct acpi_device_info *info; - char *hid; - acpi_status status; - - status = acpi_get_object_info(handle, &info); - if (ACPI_FAILURE(status)) - return status; - - if (info->type == ACPI_TYPE_PROCESSOR) { - kfree(info); - return AE_OK; /* found a processor object */ - } - - if (!(info->valid & ACPI_VALID_HID)) { - kfree(info); - return AE_ERROR; - } - - hid = info->hardware_id.string; - if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { - kfree(info); - return AE_ERROR; - } - - kfree(info); - return AE_OK; /* found a processor device object */ -} - -static acpi_status -processor_walk_namespace_cb(acpi_handle handle, - u32 lvl, void *context, void **rv) -{ - acpi_status status; - int *action = context; - - status = is_processor_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* not a processor; continue to walk */ - - switch (*action) { - case INSTALL_NOTIFY_HANDLER: - acpi_install_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_processor_hotplug_notify, - NULL); - break; - case UNINSTALL_NOTIFY_HANDLER: - acpi_remove_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_processor_hotplug_notify); - break; - default: - break; - } - - /* found a processor; skip walking underneath */ - return AE_CTRL_DEPTH; -} - -static -void acpi_processor_install_hotplug_notify(void) -{ - int action = INSTALL_NOTIFY_HANDLER; - acpi_walk_namespace(ACPI_TYPE_ANY, - ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - processor_walk_namespace_cb, NULL, &action, NULL); -} - -static -void acpi_processor_uninstall_hotplug_notify(void) -{ - int action = UNINSTALL_NOTIFY_HANDLER; - acpi_walk_namespace(ACPI_TYPE_ANY, - ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - processor_walk_namespace_cb, NULL, &action, NULL); -} - -static const struct acpi_device_id processor_device_ids[] = { - {ACPI_PROCESSOR_OBJECT_HID, 0}, - {ACPI_PROCESSOR_DEVICE_HID, 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, processor_device_ids); - -static struct acpi_driver xen_acpi_processor_driver = { - .name = "processor", - .class = ACPI_PROCESSOR_CLASS, - .ids = processor_device_ids, - .ops = { - .add = xen_acpi_processor_add, - .remove = xen_acpi_processor_remove, - }, -}; - -static int __init xen_acpi_processor_init(void) -{ - int result = 0; - - if (!xen_initial_domain()) - 
return -ENODEV; - - /* unregister the stub which only used to reserve driver space */ - xen_stub_processor_exit(); - - result = acpi_bus_register_driver(&xen_acpi_processor_driver); - if (result < 0) { - xen_stub_processor_init(); - return result; - } - - acpi_processor_install_hotplug_notify(); - return 0; -} - -static void __exit xen_acpi_processor_exit(void) -{ - if (!xen_initial_domain()) - return; - - acpi_processor_uninstall_hotplug_notify(); - - acpi_bus_unregister_driver(&xen_acpi_processor_driver); - - /* - * stub reserve space again to prevent any chance of native - * driver loading. - */ - xen_stub_processor_init(); - return; -} - -module_init(xen_acpi_processor_init); -module_exit(xen_acpi_processor_exit); -ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); -MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); -MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c deleted file mode 100644 index 9083f1e474f8..000000000000 --- a/drivers/xen/xen-acpi-memhotplug.c +++ /dev/null @@ -1,485 +0,0 @@ -/* - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. 
- */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/acpi.h> -#include <acpi/acpi_drivers.h> -#include <xen/acpi.h> -#include <xen/interface/platform.h> -#include <asm/xen/hypercall.h> - -#define PREFIX "ACPI:xen_memory_hotplug:" - -struct acpi_memory_info { - struct list_head list; - u64 start_addr; /* Memory Range start physical addr */ - u64 length; /* Memory Range length */ - unsigned short caching; /* memory cache attribute */ - unsigned short write_protect; /* memory read/write attribute */ - /* copied from buffer getting from _CRS */ - unsigned int enabled:1; -}; - -struct acpi_memory_device { - struct acpi_device *device; - struct list_head res_list; -}; - -static bool acpi_hotmem_initialized __read_mostly; - -static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) -{ - int rc; - struct xen_platform_op op; - - op.cmd = XENPF_mem_hotadd; - op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; - op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; - op.u.mem_add.pxm = pxm; - - rc = HYPERVISOR_dom0_op(&op); - if (rc) - pr_err(PREFIX "Xen Hotplug Memory Add failed on " - "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", - (unsigned long)info->start_addr, - (unsigned long)(info->start_addr + info->length), - pxm, rc); - - return rc; -} - -static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) -{ - int pxm, result; - int num_enabled = 0; - struct acpi_memory_info *info; - - if (!mem_device) - return -EINVAL; - - pxm = xen_acpi_get_pxm(mem_device->device->handle); - if (pxm < 0) - return pxm; - - list_for_each_entry(info, &mem_device->res_list, list) { - if (info->enabled) { /* just sanity check...*/ - num_enabled++; - continue; - } - - if (!info->length) - continue; - - result = xen_hotadd_memory(pxm, info); - if (result) - continue; - info->enabled = 1; - num_enabled++; - } - - if (!num_enabled) - return -ENODEV; - - return 0; -} - -static acpi_status -acpi_memory_get_resource(struct acpi_resource *resource, void *context) -{ - struct acpi_memory_device *mem_device = context; - struct acpi_resource_address64 address64; - struct acpi_memory_info *info, *new; - acpi_status status; - - status = acpi_resource_to_address64(resource, &address64); - if (ACPI_FAILURE(status) || - (address64.resource_type != ACPI_MEMORY_RANGE)) - return AE_OK; - - list_for_each_entry(info, &mem_device->res_list, list) { - if ((info->caching == address64.info.mem.caching) && - (info->write_protect == address64.info.mem.write_protect) && - (info->start_addr + info->length == address64.minimum)) { - info->length += address64.address_length; - return AE_OK; - } - } - - new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); - if (!new) - return AE_ERROR; - - INIT_LIST_HEAD(&new->list); - new->caching = address64.info.mem.caching; - new->write_protect = address64.info.mem.write_protect; - new->start_addr = address64.minimum; - new->length = address64.address_length; - list_add_tail(&new->list, &mem_device->res_list); - - return AE_OK; -} - -static int -acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) -{ - acpi_status status; - struct acpi_memory_info *info, *n; - - if (!list_empty(&mem_device->res_list)) - return 0; - - status = acpi_walk_resources(mem_device->device->handle, - METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); - - if (ACPI_FAILURE(status)) { - list_for_each_entry_safe(info, n, &mem_device->res_list, 
list) - kfree(info); - INIT_LIST_HEAD(&mem_device->res_list); - return -EINVAL; - } - - return 0; -} - -static int acpi_memory_get_device(acpi_handle handle, - struct acpi_memory_device **mem_device) -{ - struct acpi_device *device = NULL; - int result = 0; - - acpi_scan_lock_acquire(); - - acpi_bus_get_device(handle, &device); - if (device) - goto end; - - /* - * Now add the notified device. This creates the acpi_device - * and invokes .add function - */ - result = acpi_bus_scan(handle); - if (result) { - pr_warn(PREFIX "ACPI namespace scan failed\n"); - result = -EINVAL; - goto out; - } - result = acpi_bus_get_device(handle, &device); - if (result) { - pr_warn(PREFIX "Missing device object\n"); - result = -EINVAL; - goto out; - } - -end: - *mem_device = acpi_driver_data(device); - if (!(*mem_device)) { - pr_err(PREFIX "driver data not found\n"); - result = -ENODEV; - goto out; - } - -out: - acpi_scan_lock_release(); - return result; -} - -static int acpi_memory_check_device(struct acpi_memory_device *mem_device) -{ - unsigned long long current_status; - - /* Get device present/absent information from the _STA */ - if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, - "_STA", NULL, ¤t_status))) - return -ENODEV; - /* - * Check for device status. Device should be - * present/enabled/functioning. - */ - if (!((current_status & ACPI_STA_DEVICE_PRESENT) - && (current_status & ACPI_STA_DEVICE_ENABLED) - && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) - return -ENODEV; - - return 0; -} - -static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) -{ - pr_debug(PREFIX "Xen does not support memory hotremove\n"); - - return -ENOSYS; -} - -static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) -{ - struct acpi_memory_device *mem_device; - struct acpi_device *device; - u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ - - switch (event) { - case ACPI_NOTIFY_BUS_CHECK: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived BUS CHECK notification for device\n")); - /* Fall Through */ - case ACPI_NOTIFY_DEVICE_CHECK: - if (event == ACPI_NOTIFY_DEVICE_CHECK) - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived DEVICE CHECK notification for device\n")); - - if (acpi_memory_get_device(handle, &mem_device)) { - pr_err(PREFIX "Cannot find driver data\n"); - break; - } - - ost_code = ACPI_OST_SC_SUCCESS; - break; - - case ACPI_NOTIFY_EJECT_REQUEST: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived EJECT REQUEST notification for device\n")); - - acpi_scan_lock_acquire(); - if (acpi_bus_get_device(handle, &device)) { - acpi_scan_lock_release(); - pr_err(PREFIX "Device doesn't exist\n"); - break; - } - mem_device = acpi_driver_data(device); - if (!mem_device) { - acpi_scan_lock_release(); - pr_err(PREFIX "Driver Data is NULL\n"); - break; - } - - /* - * TBD: implement acpi_memory_disable_device and invoke - * acpi_bus_remove if Xen support hotremove in the future - */ - acpi_memory_disable_device(mem_device); - acpi_scan_lock_release(); - break; - - default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); - /* non-hotplug event; possibly handled by other handler */ - return; - } - - (void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); - return; -} - -static int xen_acpi_memory_device_add(struct acpi_device *device) -{ - int result; - struct acpi_memory_device *mem_device = NULL; - - - if (!device) - return -EINVAL; - - mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); - if (!mem_device) - 
return -ENOMEM; - - INIT_LIST_HEAD(&mem_device->res_list); - mem_device->device = device; - sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); - sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); - device->driver_data = mem_device; - - /* Get the range from the _CRS */ - result = acpi_memory_get_device_resources(mem_device); - if (result) { - kfree(mem_device); - return result; - } - - /* - * For booting existed memory devices, early boot code has recognized - * memory area by EFI/E820. If DSDT shows these memory devices on boot, - * hotplug is not necessary for them. - * For hot-added memory devices during runtime, it need hypercall to - * Xen hypervisor to add memory. - */ - if (!acpi_hotmem_initialized) - return 0; - - if (!acpi_memory_check_device(mem_device)) - result = xen_acpi_memory_enable_device(mem_device); - - return result; -} - -static int xen_acpi_memory_device_remove(struct acpi_device *device) -{ - struct acpi_memory_device *mem_device = NULL; - - if (!device || !acpi_driver_data(device)) - return -EINVAL; - - mem_device = acpi_driver_data(device); - kfree(mem_device); - - return 0; -} - -/* - * Helper function to check for memory device - */ -static acpi_status is_memory_device(acpi_handle handle) -{ - char *hardware_id; - acpi_status status; - struct acpi_device_info *info; - - status = acpi_get_object_info(handle, &info); - if (ACPI_FAILURE(status)) - return status; - - if (!(info->valid & ACPI_VALID_HID)) { - kfree(info); - return AE_ERROR; - } - - hardware_id = info->hardware_id.string; - if ((hardware_id == NULL) || - (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) - status = AE_ERROR; - - kfree(info); - return status; -} - -static acpi_status -acpi_memory_register_notify_handler(acpi_handle handle, - u32 level, void *ctxt, void **retv) -{ - acpi_status status; - - status = is_memory_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* continue */ - - status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, - acpi_memory_device_notify, NULL); - /* continue */ - return AE_OK; -} - -static acpi_status -acpi_memory_deregister_notify_handler(acpi_handle handle, - u32 level, void *ctxt, void **retv) -{ - acpi_status status; - - status = is_memory_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* continue */ - - status = acpi_remove_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_memory_device_notify); - - return AE_OK; /* continue */ -} - -static const struct acpi_device_id memory_device_ids[] = { - {ACPI_MEMORY_DEVICE_HID, 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, memory_device_ids); - -static struct acpi_driver xen_acpi_memory_device_driver = { - .name = "acpi_memhotplug", - .class = ACPI_MEMORY_DEVICE_CLASS, - .ids = memory_device_ids, - .ops = { - .add = xen_acpi_memory_device_add, - .remove = xen_acpi_memory_device_remove, - }, -}; - -static int __init xen_acpi_memory_device_init(void) -{ - int result; - acpi_status status; - - if (!xen_initial_domain()) - return -ENODEV; - - /* unregister the stub which only used to reserve driver space */ - xen_stub_memory_device_exit(); - - result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); - if (result < 0) { - xen_stub_memory_device_init(); - return -ENODEV; - } - - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - acpi_memory_register_notify_handler, - NULL, NULL, NULL); - - if (ACPI_FAILURE(status)) { - pr_warn(PREFIX "walk_namespace failed\n"); - acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); - 
xen_stub_memory_device_init(); - return -ENODEV; - } - - acpi_hotmem_initialized = true; - return 0; -} - -static void __exit xen_acpi_memory_device_exit(void) -{ - acpi_status status; - - if (!xen_initial_domain()) - return; - - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - acpi_memory_deregister_notify_handler, - NULL, NULL, NULL); - if (ACPI_FAILURE(status)) - pr_warn(PREFIX "walk_namespace failed\n"); - - acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); - - /* - * stub reserve space again to prevent any chance of native - * driver loading. - */ - xen_stub_memory_device_init(); - return; -} - -module_init(xen_acpi_memory_device_init); -module_exit(xen_acpi_memory_device_exit); -ACPI_MODULE_NAME("xen-acpi-memhotplug"); -MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); -MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 59708fdd068b..ede69a5278d3 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -1,28 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * xen-acpi-pad.c - Xen pad interface * * Copyright (c) 2012, Intel Corporation. * Author: Liu, Jinsong <jinsong.liu@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/types.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> -#include <asm/xen/hypercall.h> +#include <linux/acpi.h> +#include <xen/xen.h> #include <xen/interface/version.h> #include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> #define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" @@ -37,7 +29,7 @@ static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) op.u.core_parking.type = XEN_CORE_PARKING_SET; op.u.core_parking.idle_nums = idle_nums; - return HYPERVISOR_dom0_op(&op); + return HYPERVISOR_platform_op(&op); } static int xen_acpi_pad_idle_cpus_num(void) @@ -47,7 +39,7 @@ static int xen_acpi_pad_idle_cpus_num(void) op.cmd = XENPF_core_parking; op.u.core_parking.type = XEN_CORE_PARKING_GET; - return HYPERVISOR_dom0_op(&op) + return HYPERVISOR_platform_op(&op) ?: op.u.core_parking.idle_nums; } @@ -78,27 +70,14 @@ static int acpi_pad_pur(acpi_handle handle) return num; } -/* Notify firmware how many CPUs are idle */ -static void acpi_pad_ost(acpi_handle handle, int stat, - uint32_t idle_nums) -{ - union acpi_object params[3] = { - {.type = ACPI_TYPE_INTEGER,}, - {.type = ACPI_TYPE_INTEGER,}, - {.type = ACPI_TYPE_BUFFER,}, - }; - struct acpi_object_list arg_list = {3, params}; - - params[0].integer.value = ACPI_PROCESSOR_AGGREGATOR_NOTIFY; - params[1].integer.value = stat; - params[2].buffer.length = 4; - params[2].buffer.pointer = (void *)&idle_nums; - acpi_evaluate_object(handle, "_OST", &arg_list, NULL); -} - static void acpi_pad_handle_notify(acpi_handle handle) { int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + mutex_lock(&xen_cpu_lock); idle_nums = acpi_pad_pur(handle); 
@@ -110,7 +89,8 @@ static void acpi_pad_handle_notify(acpi_handle handle) idle_nums = xen_acpi_pad_idle_cpus(idle_nums) ?: xen_acpi_pad_idle_cpus_num(); if (idle_nums >= 0) - acpi_pad_ost(handle, 0, idle_nums); + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); mutex_unlock(&xen_cpu_lock); } @@ -142,7 +122,7 @@ static int acpi_pad_add(struct acpi_device *device) return 0; } -static int acpi_pad_remove(struct acpi_device *device) +static void acpi_pad_remove(struct acpi_device *device) { mutex_lock(&xen_cpu_lock); xen_acpi_pad_idle_cpus(0); @@ -150,7 +130,6 @@ static int acpi_pad_remove(struct acpi_device *device) acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_pad_notify); - return 0; } static const struct acpi_device_id pad_device_ids[] = { diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 13bc6c31c060..f2e8eaf684ba 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -1,20 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2012 by Oracle Inc * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> * - * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * This code borrows ideas from + * https://lore.kernel.org/lkml/1322673664-14642-6-git-send-email-konrad.wilk@oracle.com * so many thanks go to Kevin Tian <kevin.tian@intel.com> * and Yu Ke <ke.yu@intel.com>. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -28,10 +20,8 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/syscore_ops.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h> #include <acpi/processor.h> - #include <xen/xen.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> @@ -55,6 +45,8 @@ static unsigned long *acpi_ids_done; static unsigned long *acpi_id_present; /* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ static unsigned long *acpi_id_cst_present; +/* Which ACPI P-State dependencies for a enumerated processor */ +static struct acpi_psd_package *acpi_psd; static int push_cxx_to_hypervisor(struct acpi_processor *_pr) { @@ -118,7 +110,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); if (!no_hypercall) - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (!ret) { pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); @@ -129,7 +121,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) pr_debug(" C%d: %s %d uS\n", cx->type, cx->desc, (u32)cx->latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. 
*/ @@ -246,7 +238,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) } if (!no_hypercall) - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (!ret) { struct acpi_processor_performance *perf; @@ -261,7 +253,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) (u32) perf->states[i].power, (u32) perf->states[i].transition_latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ @@ -304,7 +296,7 @@ static unsigned int __init get_max_acpi_id(void) info = &op.u.pcpu_info; info->xen_cpuid = 0; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return NR_CPUS; @@ -312,7 +304,7 @@ static unsigned int __init get_max_acpi_id(void) last_cpu = op.u.pcpu_info.max_present; for (i = 0; i <= last_cpu; i++) { info->xen_cpuid = i; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) continue; max_acpi_id = max(info->acpi_id, max_acpi_id); @@ -362,11 +354,17 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) default: return AE_OK; } + if (invalid_phys_cpuid(acpi_get_phys_id(handle, + acpi_type == ACPI_TYPE_DEVICE, + acpi_id))) { + pr_debug("CPU with ACPI ID %u is unavailable\n", acpi_id); + return AE_OK; + } /* There are more ACPI Processor objects than in x2APIC or MADT. * This can happen with incorrect ACPI SSDT declerations. */ - if (acpi_id > nr_acpi_bits) { - pr_debug("We only have %u, trying to set %u\n", - nr_acpi_bits, acpi_id); + if (acpi_id >= nr_acpi_bits) { + pr_debug("max acpi id %u, trying to set %u\n", + nr_acpi_bits - 1, acpi_id); return AE_OK; } /* OK, There is a ACPI Processor object */ @@ -374,6 +372,13 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + /* It has P-state dependencies */ + if (!acpi_processor_get_psd(handle, &acpi_psd[acpi_id])) { + pr_debug("ACPI CPU%u w/ PST:coord_type = %llu domain = %llu\n", + acpi_id, acpi_psd[acpi_id].coord_type, + acpi_psd[acpi_id].domain); + } + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { if (!pblk) @@ -397,20 +402,28 @@ static int check_acpi_ids(struct acpi_processor *pr_backup) /* All online CPUs have been processed at this stage. Now verify * whether in fact "online CPUs" == physical CPUs. 
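The bounds check in read_acpi_id() above changes from '>' to '>=' because nr_acpi_bits counts bits, so the largest valid ACPI ID is nr_acpi_bits - 1; accepting an ID equal to nr_acpi_bits would set a bit one past the end of the bitmap. A standalone sketch of the corrected check (the bitmap size and the IDs are made up):

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG    (sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
        unsigned int nr_acpi_bits = 8;          /* max ACPI id + 1; value made up */
        unsigned long present[BITS_TO_LONGS(8)] = { 0 };
        unsigned int acpi_id;

        for (acpi_id = 0; acpi_id <= nr_acpi_bits; acpi_id++) {
                /* Valid bit indices are 0 .. nr_acpi_bits - 1, hence ">=". */
                if (acpi_id >= nr_acpi_bits) {
                        printf("max acpi id %u, trying to set %u -> skipped\n",
                               nr_acpi_bits - 1, acpi_id);
                        continue;
                }
                present[acpi_id / BITS_PER_LONG] |= 1UL << (acpi_id % BITS_PER_LONG);
        }
        printf("present[0] = %#lx\n", present[0]);      /* 0xff: ids 0..7 set */
        return 0;
}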
*/ - acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_id_present = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_id_present) return -ENOMEM; - acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_id_cst_present = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_id_cst_present) { - kfree(acpi_id_present); + bitmap_free(acpi_id_present); + return -ENOMEM; + } + + acpi_psd = kcalloc(nr_acpi_bits, sizeof(struct acpi_psd_package), + GFP_KERNEL); + if (!acpi_psd) { + bitmap_free(acpi_id_present); + bitmap_free(acpi_id_cst_present); return -ENOMEM; } acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, read_acpi_id, NULL, NULL, NULL); - acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, NULL, NULL); upload: if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { @@ -419,48 +432,25 @@ upload: pr_backup->acpi_id = i; /* Mask out C-states if there are no _CST or PBLK */ pr_backup->flags.power = test_bit(i, acpi_id_cst_present); + /* num_entries is non-zero if we evaluated _PSD */ + if (acpi_psd[i].num_entries) { + memcpy(&pr_backup->performance->domain_info, + &acpi_psd[i], + sizeof(struct acpi_psd_package)); + } (void)upload_pm_data(pr_backup); } } return 0; } -static int __init check_prereq(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - if (!xen_initial_domain()) - return -ENODEV; - - if (!acpi_gbl_FADT.smi_command) - return -ENODEV; - - if (c->x86_vendor == X86_VENDOR_INTEL) { - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - return 0; - } - if (c->x86_vendor == X86_VENDOR_AMD) { - /* Copied from powernow-k8.h, can't include ../cpufreq/powernow - * as we get compile warnings for the static functions. - */ -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define USE_HW_PSTATE 0x00000080 - u32 eax, ebx, ecx, edx; - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) - return -ENODEV; - return 0; - } - return -ENODEV; -} /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance __percpu *acpi_perf_data; static void free_acpi_perf_data(void) { - unsigned int i; + int i; /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ for_each_possible_cpu(i) @@ -472,7 +462,7 @@ static void free_acpi_perf_data(void) static int xen_upload_processor_pm_data(void) { struct acpi_processor *pr_backup = NULL; - unsigned int i; + int i; int rc = 0; pr_info("Uploading Xen processor PM info\n"); @@ -483,11 +473,8 @@ static int xen_upload_processor_pm_data(void) if (!_pr) continue; - if (!pr_backup) { - pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (pr_backup) - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); - } + if (!pr_backup) + pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL); (void)upload_pm_data(_pr); } @@ -497,33 +484,56 @@ static int xen_upload_processor_pm_data(void) return rc; } -static void xen_acpi_processor_resume(void) +static void xen_acpi_processor_resume_worker(struct work_struct *dummy) { + int rc; + bitmap_zero(acpi_ids_done, nr_acpi_bits); - xen_upload_processor_pm_data(); + + rc = xen_upload_processor_pm_data(); + if (rc != 0) + pr_info("ACPI data upload failed, error = %d\n", rc); +} + +static void xen_acpi_processor_resume(void *data) +{ + static DECLARE_WORK(wq, xen_acpi_processor_resume_worker); + + /* + * xen_upload_processor_pm_data() calls non-atomic code. 
+ * However, the context for xen_acpi_processor_resume is syscore + * with only the boot CPU online and in an atomic context. + * + * So defer the upload for some point safer. + */ + schedule_work(&wq); } -static struct syscore_ops xap_syscore_ops = { +static const struct syscore_ops xap_syscore_ops = { .resume = xen_acpi_processor_resume, }; +static struct syscore xap_syscore = { + .ops = &xap_syscore_ops, +}; + static int __init xen_acpi_processor_init(void) { - unsigned int i; - int rc = check_prereq(); + int i; + int rc; - if (rc) - return rc; + if (!xen_initial_domain()) + return -ENODEV; nr_acpi_bits = get_max_acpi_id() + 1; - acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_ids_done = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_ids_done) return -ENOMEM; acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { pr_debug("Memory allocation error for acpi_perf_data\n"); - kfree(acpi_ids_done); + bitmap_free(acpi_ids_done); return -ENOMEM; } for_each_possible_cpu(i) { @@ -557,34 +567,31 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + register_syscore(&xap_syscore); return 0; err_unregister: - for_each_possible_cpu(i) { - struct acpi_processor_performance *perf; - perf = per_cpu_ptr(acpi_perf_data, i); - acpi_processor_unregister_performance(perf, i); - } + for_each_possible_cpu(i) + acpi_processor_unregister_performance(i); + err_out: /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ free_acpi_perf_data(); - kfree(acpi_ids_done); + bitmap_free(acpi_ids_done); return rc; } static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); - kfree(acpi_ids_done); - kfree(acpi_id_present); - kfree(acpi_id_cst_present); - for_each_possible_cpu(i) { - struct acpi_processor_performance *perf; - perf = per_cpu_ptr(acpi_perf_data, i); - acpi_processor_unregister_performance(perf, i); - } + unregister_syscore(&xap_syscore); + bitmap_free(acpi_ids_done); + bitmap_free(acpi_id_present); + bitmap_free(acpi_id_cst_present); + kfree(acpi_psd); + for_each_possible_cpu(i) + acpi_processor_unregister_performance(i); + free_acpi_perf_data(); } diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index e555845d61fa..b293d7652f15 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -33,8 +33,11 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/errno.h> +#include <linux/mm_types.h> +#include <linux/init.h> #include <linux/capability.h> +#include <linux/memory_hotplug.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -42,21 +45,34 @@ #include <xen/xenbus.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/mem-reservation.h> #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) #define BALLOON_CLASS_NAME "xen_memory" +#ifdef CONFIG_MEMORY_HOTPLUG +u64 xen_saved_max_mem_size = 0; +#endif + static struct device balloon_dev; static int register_balloon(struct device *dev); /* React to a change in the target key */ static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - unsigned long long new_target; + unsigned long long new_target, static_max; int err; + static bool watch_fired; + static long target_diff; + +#ifdef CONFIG_MEMORY_HOTPLUG + /* The balloon driver will take care of adding memory now. 
*/ + if (xen_saved_max_mem_size) + max_mem_size = xen_saved_max_mem_size; +#endif err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); if (err != 1) { @@ -67,7 +83,24 @@ static void watch_target(struct xenbus_watch *watch, /* The given memory/target value is in KiB, so it needs converting to * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + new_target >>= PAGE_SHIFT - 10; + + if (!watch_fired) { + watch_fired = true; + + if ((xenbus_scanf(XBT_NIL, "memory", "static-max", + "%llu", &static_max) == 1) || + (xenbus_scanf(XBT_NIL, "memory", "memory_static_max", + "%llu", &static_max) == 1)) + static_max >>= PAGE_SHIFT - 10; + else + static_max = balloon_stats.current_pages; + + target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0 + : static_max - balloon_stats.target_pages; + } + + balloon_set_new_target(new_target - target_diff); } static struct xenbus_watch target_watch = { .node = "memory/target", @@ -92,39 +125,22 @@ static struct notifier_block xenstore_notifier = { .notifier_call = balloon_init_watcher, }; -static int __init balloon_init(void) +void xen_balloon_init(void) { - if (!xen_domain()) - return -ENODEV; - - pr_info("Initialising balloon driver\n"); - register_balloon(&balloon_dev); - register_xen_selfballooning(&balloon_dev); - register_xenstore_notifier(&xenstore_notifier); - - return 0; -} -subsys_initcall(balloon_init); - -static void balloon_exit(void) -{ - /* XXX - release balloon here */ - return; } - -module_exit(balloon_exit); +EXPORT_SYMBOL_GPL(xen_balloon_init); #define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct device *dev, \ + static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, format, ##args); \ } \ - static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR_RO(name) BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); @@ -134,17 +150,17 @@ static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); +static DEVICE_BOOL_ATTR(scrub_pages, 0644, xen_scrub_pages); -static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, +static ssize_t target_kb_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); } -static ssize_t store_target_kb(struct device *dev, +static ssize_t target_kb_store(struct device *dev, struct device_attribute *attr, - const char *buf, - size_t count) + const char *buf, size_t count) { char *endchar; unsigned long long target_bytes; @@ -159,22 +175,19 @@ static ssize_t store_target_kb(struct device *dev, return count; } -static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); - +static DEVICE_ATTR_RW(target_kb); -static ssize_t show_target(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t target_show(struct device *dev, struct device_attribute *attr, + char *buf) { return sprintf(buf, "%llu\n", (unsigned long long)balloon_stats.target_pages << PAGE_SHIFT); } -static ssize_t store_target(struct device *dev, +static ssize_t 
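watch_target() above converts the KiB value read from xenstore into pages with new_target >>= PAGE_SHIFT - 10, the exact inverse of the PAGES2KB() macro used by the sysfs show routines. A standalone check of that arithmetic for 4 KiB pages (PAGE_SHIFT and the sample target are assumptions of this sketch):

#include <stdio.h>

#define PAGE_SHIFT   12                                 /* assumed 4 KiB pages */
#define PAGES2KB(p)  ((p) << (PAGE_SHIFT - 10))         /* pages -> KiB, as in the driver */
#define KB2PAGES(kb) ((kb) >> (PAGE_SHIFT - 10))        /* KiB -> pages, the inverse */

int main(void)
{
        unsigned long long target_kib = 1048576;        /* 1 GiB expressed in KiB */
        unsigned long long pages = KB2PAGES(target_kib);

        printf("%llu KiB -> %llu pages\n", target_kib, pages);          /* 262144 */
        printf("%llu pages -> %llu KiB\n", pages, PAGES2KB(pages));     /* 1048576 */
        return 0;
}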
target_store(struct device *dev, struct device_attribute *attr, - const char *buf, - size_t count) + const char *buf, size_t count) { char *endchar; unsigned long long target_bytes; @@ -189,17 +202,21 @@ static ssize_t store_target(struct device *dev, return count; } -static DEVICE_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); +static DEVICE_ATTR_RW(target); +static struct attribute *balloon_attrs[] = { + &dev_attr_target_kb.attr, + &dev_attr_target.attr, + &dev_attr_schedule_delay.attr.attr, + &dev_attr_max_schedule_delay.attr.attr, + &dev_attr_retry_count.attr.attr, + &dev_attr_max_retry_count.attr.attr, + &dev_attr_scrub_pages.attr.attr, + NULL +}; -static struct device_attribute *balloon_attrs[] = { - &dev_attr_target_kb, - &dev_attr_target, - &dev_attr_schedule_delay.attr, - &dev_attr_max_schedule_delay.attr, - &dev_attr_retry_count.attr, - &dev_attr_max_retry_count.attr +static const struct attribute_group balloon_group = { + .attrs = balloon_attrs }; static struct attribute *balloon_info_attrs[] = { @@ -214,14 +231,20 @@ static const struct attribute_group balloon_info_group = { .attrs = balloon_info_attrs }; -static struct bus_type balloon_subsys = { +static const struct attribute_group *balloon_groups[] = { + &balloon_group, + &balloon_info_group, + NULL +}; + +static const struct bus_type balloon_subsys = { .name = BALLOON_CLASS_NAME, .dev_name = BALLOON_CLASS_NAME, }; static int register_balloon(struct device *dev) { - int i, error; + int error; error = subsys_system_register(&balloon_subsys, NULL); if (error) @@ -229,6 +252,7 @@ static int register_balloon(struct device *dev) dev->id = 0; dev->bus = &balloon_subsys; + dev->groups = balloon_groups; error = device_register(dev); if (error) { @@ -236,24 +260,5 @@ static int register_balloon(struct device *dev) return error; } - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = device_create_file(dev, balloon_attrs[i]); - if (error) - goto fail; - } - - error = sysfs_create_group(&dev->kobj, &balloon_info_group); - if (error) - goto fail; - return 0; - - fail: - while (--i >= 0) - device_remove_file(dev, balloon_attrs[i]); - device_unregister(dev); - bus_unregister(&balloon_subsys); - return error; } - -MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c new file mode 100644 index 000000000000..223870a0111b --- /dev/null +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT + +/* + * Xen frontend/backend page directory based shared buffer + * helper module. + * + * Copyright (C) 2018 EPAM Systems Inc. + * + * Author: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/mm.h> + +#include <asm/xen/hypervisor.h> +#include <xen/balloon.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/ring.h> + +#include <xen/xen-front-pgdir-shbuf.h> + +/* + * This structure represents the structure of a shared page + * that contains grant references to the pages of the shared + * buffer. This structure is common to many Xen para-virtualized + * protocols at include/xen/interface/io/ + */ +struct xen_page_directory { + grant_ref_t gref_dir_next_page; +#define XEN_GREF_LIST_END 0 + grant_ref_t gref[]; /* Variable length */ +}; + +/* + * Shared buffer ops which are differently implemented + * depending on the allocation mode, e.g. if the buffer + * is allocated by the corresponding backend or frontend. 
+ * Some of the operations. + */ +struct xen_front_pgdir_shbuf_ops { + /* + * Calculate number of grefs required to handle this buffer, + * e.g. if grefs are required for page directory only or the buffer + * pages as well. + */ + void (*calc_num_grefs)(struct xen_front_pgdir_shbuf *buf); + + /* Fill page directory according to para-virtual display protocol. */ + void (*fill_page_dir)(struct xen_front_pgdir_shbuf *buf); + + /* Claim grant references for the pages of the buffer. */ + int (*grant_refs_for_buffer)(struct xen_front_pgdir_shbuf *buf, + grant_ref_t *priv_gref_head, int gref_idx); + + /* Map grant references of the buffer. */ + int (*map)(struct xen_front_pgdir_shbuf *buf); + + /* Unmap grant references of the buffer. */ + int (*unmap)(struct xen_front_pgdir_shbuf *buf); +}; + +/* + * Get granted reference to the very first page of the + * page directory. Usually this is passed to the backend, + * so it can find/fill the grant references to the buffer's + * pages. + * + * \param buf shared buffer which page directory is of interest. + * \return granted reference to the very first page of the + * page directory. + */ +grant_ref_t +xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf) +{ + if (!buf->grefs) + return INVALID_GRANT_REF; + + return buf->grefs[0]; +} +EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); + +/* + * Map granted references of the shared buffer. + * + * Depending on the shared buffer mode of allocation + * (be_alloc flag) this can either do nothing (for buffers + * shared by the frontend itself) or map the provided granted + * references onto the backing storage (buf->pages). + * + * \param buf shared buffer which grants to be mapped. + * \return zero on success or a negative number on failure. + */ +int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) +{ + if (buf->ops && buf->ops->map) + return buf->ops->map(buf); + + /* No need to map own grant references. */ + return 0; +} +EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); + +/* + * Unmap granted references of the shared buffer. + * + * Depending on the shared buffer mode of allocation + * (be_alloc flag) this can either do nothing (for buffers + * shared by the frontend itself) or unmap the provided granted + * references. + * + * \param buf shared buffer which grants to be unmapped. + * \return zero on success or a negative number on failure. + */ +int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) +{ + if (buf->ops && buf->ops->unmap) + return buf->ops->unmap(buf); + + /* No need to unmap own grant references. */ + return 0; +} +EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap); + +/* + * Free all the resources of the shared buffer. + * + * \param buf shared buffer which resources to be freed. + */ +void xen_front_pgdir_shbuf_free(struct xen_front_pgdir_shbuf *buf) +{ + if (buf->grefs) { + int i; + + for (i = 0; i < buf->num_grefs; i++) + if (buf->grefs[i] != INVALID_GRANT_REF) + gnttab_end_foreign_access(buf->grefs[i], NULL); + } + kfree(buf->grefs); + kfree(buf->directory); +} +EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free); + +/* + * Number of grefs a page can hold with respect to the + * struct xen_page_directory header. + */ +#define XEN_NUM_GREFS_PER_PAGE ((PAGE_SIZE - \ + offsetof(struct xen_page_directory, \ + gref)) / sizeof(grant_ref_t)) + +/* + * Get the number of pages the page directory consumes itself. + * + * \param buf shared buffer. 
+ */ +static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf) +{ + return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE); +} + +/* + * Calculate the number of grant references needed to share the buffer + * and its pages when backend allocates the buffer. + * + * \param buf shared buffer. + */ +static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) +{ + /* Only for pages the page directory consumes itself. */ + buf->num_grefs = get_num_pages_dir(buf); +} + +/* + * Calculate the number of grant references needed to share the buffer + * and its pages when frontend allocates the buffer. + * + * \param buf shared buffer. + */ +static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) +{ + /* + * Number of pages the page directory consumes itself + * plus grefs for the buffer pages. + */ + buf->num_grefs = get_num_pages_dir(buf) + buf->num_pages; +} + +#define xen_page_to_vaddr(page) \ + ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page))) + +/* + * Unmap the buffer previously mapped with grant references + * provided by the backend. + * + * \param buf shared buffer. + * \return zero on success or a negative number on failure. + */ +static int backend_unmap(struct xen_front_pgdir_shbuf *buf) +{ + struct gnttab_unmap_grant_ref *unmap_ops; + int i, ret; + + if (!buf->pages || !buf->backend_map_handles || !buf->grefs) + return 0; + + unmap_ops = kcalloc(buf->num_pages, sizeof(*unmap_ops), + GFP_KERNEL); + if (!unmap_ops) + return -ENOMEM; + + for (i = 0; i < buf->num_pages; i++) { + phys_addr_t addr; + + addr = xen_page_to_vaddr(buf->pages[i]); + gnttab_set_unmap_op(&unmap_ops[i], addr, GNTMAP_host_map, + buf->backend_map_handles[i]); + } + + ret = gnttab_unmap_refs(unmap_ops, NULL, buf->pages, + buf->num_pages); + + for (i = 0; i < buf->num_pages; i++) { + if (unlikely(unmap_ops[i].status != GNTST_okay)) + dev_err(&buf->xb_dev->dev, + "Failed to unmap page %d: %d\n", + i, unmap_ops[i].status); + } + + if (ret) + dev_err(&buf->xb_dev->dev, + "Failed to unmap grant references, ret %d", ret); + + kfree(unmap_ops); + kfree(buf->backend_map_handles); + buf->backend_map_handles = NULL; + return ret; +} + +/* + * Map the buffer with grant references provided by the backend. + * + * \param buf shared buffer. + * \return zero on success or a negative number on failure. + */ +static int backend_map(struct xen_front_pgdir_shbuf *buf) +{ + struct gnttab_map_grant_ref *map_ops = NULL; + unsigned char *ptr; + int ret, cur_gref, cur_dir_page, cur_page, grefs_left; + + map_ops = kcalloc(buf->num_pages, sizeof(*map_ops), GFP_KERNEL); + if (!map_ops) + return -ENOMEM; + + buf->backend_map_handles = kcalloc(buf->num_pages, + sizeof(*buf->backend_map_handles), + GFP_KERNEL); + if (!buf->backend_map_handles) { + kfree(map_ops); + return -ENOMEM; + } + + /* + * Read page directory to get grefs from the backend: for external + * buffer we only allocate buf->grefs for the page directory, + * so buf->num_grefs has number of pages in the page directory itself. 
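The helpers above size the shared-buffer metadata: each directory page carries XEN_NUM_GREFS_PER_PAGE grant references after its one-gref header, so the directory needs DIV_ROUND_UP(num_pages, XEN_NUM_GREFS_PER_PAGE) pages, and a frontend-allocated buffer additionally needs one gref per data page. A standalone sketch of that arithmetic (PAGE_SIZE, grant_ref_t and the buffer size are assumptions here):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096u                         /* assumed page size */
typedef uint32_t grant_ref_t;

struct xen_page_directory {
        grant_ref_t gref_dir_next_page;
        grant_ref_t gref[];                     /* variable length */
};

/* Grefs one directory page can hold after its header, as in the driver. */
#define XEN_NUM_GREFS_PER_PAGE ((PAGE_SIZE - \
        offsetof(struct xen_page_directory, gref)) / sizeof(grant_ref_t))

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int num_pages = 2048;          /* e.g. an 8 MiB buffer (made up) */
        unsigned int per_page = XEN_NUM_GREFS_PER_PAGE;
        unsigned int dir_pages = DIV_ROUND_UP(num_pages, per_page);

        printf("grefs per directory page: %u\n", per_page);            /* 1023 */
        printf("directory pages for %u data pages: %u\n", num_pages, dir_pages);
        printf("grefs for a frontend-allocated buffer: %u\n", dir_pages + num_pages);
        return 0;
}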
+ */ + ptr = buf->directory; + grefs_left = buf->num_pages; + cur_page = 0; + for (cur_dir_page = 0; cur_dir_page < buf->num_grefs; cur_dir_page++) { + struct xen_page_directory *page_dir = + (struct xen_page_directory *)ptr; + int to_copy = XEN_NUM_GREFS_PER_PAGE; + + if (to_copy > grefs_left) + to_copy = grefs_left; + + for (cur_gref = 0; cur_gref < to_copy; cur_gref++) { + phys_addr_t addr; + + addr = xen_page_to_vaddr(buf->pages[cur_page]); + gnttab_set_map_op(&map_ops[cur_page], addr, + GNTMAP_host_map, + page_dir->gref[cur_gref], + buf->xb_dev->otherend_id); + cur_page++; + } + + grefs_left -= to_copy; + ptr += PAGE_SIZE; + } + ret = gnttab_map_refs(map_ops, NULL, buf->pages, buf->num_pages); + + /* Save handles even if error, so we can unmap. */ + for (cur_page = 0; cur_page < buf->num_pages; cur_page++) { + if (likely(map_ops[cur_page].status == GNTST_okay)) { + buf->backend_map_handles[cur_page] = + map_ops[cur_page].handle; + } else { + buf->backend_map_handles[cur_page] = + INVALID_GRANT_HANDLE; + if (!ret) + ret = -ENXIO; + dev_err(&buf->xb_dev->dev, + "Failed to map page %d: %d\n", + cur_page, map_ops[cur_page].status); + } + } + + if (ret) { + dev_err(&buf->xb_dev->dev, + "Failed to map grant references, ret %d", ret); + backend_unmap(buf); + } + + kfree(map_ops); + return ret; +} + +/* + * Fill page directory with grant references to the pages of the + * page directory itself. + * + * The grant references to the buffer pages are provided by the + * backend in this case. + * + * \param buf shared buffer. + */ +static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf) +{ + struct xen_page_directory *page_dir; + unsigned char *ptr; + int i, num_pages_dir; + + ptr = buf->directory; + num_pages_dir = get_num_pages_dir(buf); + + /* Fill only grefs for the page directory itself. */ + for (i = 0; i < num_pages_dir - 1; i++) { + page_dir = (struct xen_page_directory *)ptr; + + page_dir->gref_dir_next_page = buf->grefs[i + 1]; + ptr += PAGE_SIZE; + } + /* Last page must say there is no more pages. */ + page_dir = (struct xen_page_directory *)ptr; + page_dir->gref_dir_next_page = XEN_GREF_LIST_END; +} + +/* + * Fill page directory with grant references to the pages of the + * page directory and the buffer we share with the backend. + * + * \param buf shared buffer. + */ +static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) +{ + unsigned char *ptr; + int cur_gref, grefs_left, to_copy, i, num_pages_dir; + + ptr = buf->directory; + num_pages_dir = get_num_pages_dir(buf); + + /* + * While copying, skip grefs at start, they are for pages + * granted for the page directory itself. + */ + cur_gref = num_pages_dir; + grefs_left = buf->num_pages; + for (i = 0; i < num_pages_dir; i++) { + struct xen_page_directory *page_dir = + (struct xen_page_directory *)ptr; + + if (grefs_left <= XEN_NUM_GREFS_PER_PAGE) { + to_copy = grefs_left; + page_dir->gref_dir_next_page = XEN_GREF_LIST_END; + } else { + to_copy = XEN_NUM_GREFS_PER_PAGE; + page_dir->gref_dir_next_page = buf->grefs[i + 1]; + } + memcpy(&page_dir->gref, &buf->grefs[cur_gref], + to_copy * sizeof(grant_ref_t)); + ptr += PAGE_SIZE; + grefs_left -= to_copy; + cur_gref += to_copy; + } +} + +/* + * Grant references to the frontend's buffer pages. + * + * These will be shared with the backend, so it can + * access the buffer's data. + * + * \param buf shared buffer. + * \return zero on success or a negative number on failure. 
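guest_fill_page_dir() above spreads the buffer grefs over the chained directory pages: every page points at the gref of the next one and the last page is terminated with XEN_GREF_LIST_END. A much-simplified, runnable sketch of that chunking loop (the per-page capacity and all gref values are made up so the chaining stays visible):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef uint32_t grant_ref_t;
#define GREFS_PER_PAGE    4             /* tiny capacity, for illustration only */
#define XEN_GREF_LIST_END 0

struct page_dir {
        grant_ref_t next;               /* gref of the next directory page, or END */
        grant_ref_t gref[GREFS_PER_PAGE];
};

int main(void)
{
        /* Ten buffer grefs to publish; in the driver they follow the directory
         * grefs inside a single grefs[] array (all values invented). */
        grant_ref_t bufgrefs[10] = { 101, 102, 103, 104, 105, 106, 107, 108, 109, 110 };
        grant_ref_t dirgrefs[3] = { 11, 12, 13 };       /* one gref per directory page */
        struct page_dir dir[3];
        int left = 10, cur = 0, i;

        memset(dir, 0, sizeof(dir));
        for (i = 0; i < 3; i++) {
                int to_copy = left <= GREFS_PER_PAGE ? left : GREFS_PER_PAGE;

                /* The last page ends the list, the others chain to the next one. */
                dir[i].next = left <= GREFS_PER_PAGE ? XEN_GREF_LIST_END
                                                     : dirgrefs[i + 1];
                memcpy(dir[i].gref, &bufgrefs[cur], to_copy * sizeof(grant_ref_t));
                cur += to_copy;
                left -= to_copy;
        }

        for (i = 0; i < 3; i++)
                printf("dir page %d: next=%u, first gref=%u\n",
                       i, (unsigned)dir[i].next, (unsigned)dir[i].gref[0]);
        return 0;
}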
+ */ +static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf, + grant_ref_t *priv_gref_head, + int gref_idx) +{ + int i, cur_ref, otherend_id; + + otherend_id = buf->xb_dev->otherend_id; + for (i = 0; i < buf->num_pages; i++) { + cur_ref = gnttab_claim_grant_reference(priv_gref_head); + if (cur_ref < 0) + return cur_ref; + + gnttab_grant_foreign_access_ref(cur_ref, otherend_id, + xen_page_to_gfn(buf->pages[i]), + 0); + buf->grefs[gref_idx++] = cur_ref; + } + return 0; +} + +/* + * Grant all the references needed to share the buffer. + * + * Grant references to the page directory pages and, if + * needed, also to the pages of the shared buffer data. + * + * \param buf shared buffer. + * \return zero on success or a negative number on failure. + */ +static int grant_references(struct xen_front_pgdir_shbuf *buf) +{ + grant_ref_t priv_gref_head; + int ret, i, j, cur_ref; + int otherend_id, num_pages_dir; + + ret = gnttab_alloc_grant_references(buf->num_grefs, &priv_gref_head); + if (ret < 0) { + dev_err(&buf->xb_dev->dev, + "Cannot allocate grant references\n"); + return ret; + } + + otherend_id = buf->xb_dev->otherend_id; + j = 0; + num_pages_dir = get_num_pages_dir(buf); + for (i = 0; i < num_pages_dir; i++) { + unsigned long frame; + + cur_ref = gnttab_claim_grant_reference(&priv_gref_head); + if (cur_ref < 0) + return cur_ref; + + frame = xen_page_to_gfn(virt_to_page(buf->directory + + PAGE_SIZE * i)); + gnttab_grant_foreign_access_ref(cur_ref, otherend_id, frame, 0); + buf->grefs[j++] = cur_ref; + } + + if (buf->ops->grant_refs_for_buffer) { + ret = buf->ops->grant_refs_for_buffer(buf, &priv_gref_head, j); + if (ret) + return ret; + } + + gnttab_free_grant_references(priv_gref_head); + return 0; +} + +/* + * Allocate all required structures to mange shared buffer. + * + * \param buf shared buffer. + * \return zero on success or a negative number on failure. + */ +static int alloc_storage(struct xen_front_pgdir_shbuf *buf) +{ + buf->grefs = kcalloc(buf->num_grefs, sizeof(*buf->grefs), GFP_KERNEL); + if (!buf->grefs) + return -ENOMEM; + + buf->directory = kcalloc(get_num_pages_dir(buf), PAGE_SIZE, GFP_KERNEL); + if (!buf->directory) + return -ENOMEM; + + return 0; +} + +/* + * For backend allocated buffers we don't need grant_refs_for_buffer + * as those grant references are allocated at backend side. + */ +static const struct xen_front_pgdir_shbuf_ops backend_ops = { + .calc_num_grefs = backend_calc_num_grefs, + .fill_page_dir = backend_fill_page_dir, + .map = backend_map, + .unmap = backend_unmap +}; + +/* + * For locally granted references we do not need to map/unmap + * the references. + */ +static const struct xen_front_pgdir_shbuf_ops local_ops = { + .calc_num_grefs = guest_calc_num_grefs, + .fill_page_dir = guest_fill_page_dir, + .grant_refs_for_buffer = guest_grant_refs_for_buffer, +}; + +/* + * Allocate a new instance of a shared buffer. + * + * \param cfg configuration to be used while allocating a new shared buffer. + * \return zero on success or a negative number on failure. 
+ */ +int xen_front_pgdir_shbuf_alloc(struct xen_front_pgdir_shbuf_cfg *cfg) +{ + struct xen_front_pgdir_shbuf *buf = cfg->pgdir; + int ret; + + if (cfg->be_alloc) + buf->ops = &backend_ops; + else + buf->ops = &local_ops; + buf->xb_dev = cfg->xb_dev; + buf->num_pages = cfg->num_pages; + buf->pages = cfg->pages; + + buf->ops->calc_num_grefs(buf); + + ret = alloc_storage(buf); + if (ret) + goto fail; + + ret = grant_references(buf); + if (ret) + goto fail; + + buf->ops->fill_page_dir(buf); + + return 0; + +fail: + xen_front_pgdir_shbuf_free(buf); + return ret; +} +EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_alloc); + +MODULE_DESCRIPTION("Xen frontend/backend page directory based " + "shared buffer handling"); +MODULE_AUTHOR("Oleksandr Andrushchenko"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile index ffe0ad3438bd..d63df09de81c 100644 --- a/drivers/xen/xen-pciback/Makefile +++ b/drivers/xen/xen-pciback/Makefile @@ -1,4 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +# N.B. The below cannot be expressed with a single line using +# CONFIG_XEN_PCI_STUB as it always remains in "y" state, +# thus preventing the driver to be built as a module. +# Please note, that CONFIG_XEN_PCIDEV_BACKEND and +# CONFIG_XEN_PCIDEV_STUB are mutually exclusive. obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o +obj-$(CONFIG_XEN_PCIDEV_STUB) += xen-pciback.o xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o xen-pciback-y += conf_space.o conf_space_header.o \ diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 46ae0f9f02ad..d47eee6c5143 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Functions for creating a virtual configuration space for * exported PCI Devices. @@ -9,15 +10,17 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define dev_fmt(fmt) DRV_NAME ": " fmt + #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/pci.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" -static bool permissive; -module_param(permissive, bool, 0644); +bool xen_pcibk_permissive; +module_param_named(permissive, xen_pcibk_permissive, bool, 0644); /* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. 
*/ @@ -148,14 +151,12 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); const struct config_field_entry *cfg_entry; const struct config_field *field; - int req_start, req_end, field_start, field_end; + int field_start, field_end; /* if read fails for any reason, return 0 * (as if device didn't respond) */ u32 value = 0, tmp_val; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", - pci_name(dev), size, offset); + dev_dbg(&dev->dev, "read %d bytes at 0x%x\n", size, offset); if (!valid_request(offset, size)) { err = XEN_PCI_ERR_invalid_offset; @@ -178,13 +179,10 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { field = cfg_entry->field; - req_start = offset; - req_end = offset + size; field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { + if (offset + size > field_start && field_end > offset) { err = conf_space_read(dev, cfg_entry, field_start, &tmp_val); if (err) @@ -192,14 +190,12 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, value = merge_value(value, tmp_val, get_mask(field->size), - field_start - req_start); + field_start - offset); } } out: - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", - pci_name(dev), size, offset, value); + dev_dbg(&dev->dev, "read %d bytes at 0x%x = %x\n", size, offset, value); *ret_val = value; return xen_pcibios_err_to_errno(err); @@ -212,12 +208,10 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) const struct config_field_entry *cfg_entry; const struct config_field *field; u32 tmp_val; - int req_start, req_end, field_start, field_end; + int field_start, field_end; - if (unlikely(verbose_request)) - printk(KERN_DEBUG - DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", - pci_name(dev), size, offset, value); + dev_dbg(&dev->dev, "write request %d bytes at 0x%x = %x\n", + size, offset, value); if (!valid_request(offset, size)) return XEN_PCI_ERR_invalid_offset; @@ -225,22 +219,17 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { field = cfg_entry->field; - req_start = offset; - req_end = offset + size; field_start = OFFSET(cfg_entry); field_end = OFFSET(cfg_entry) + field->size; - if ((req_start >= field_start && req_start < field_end) - || (req_end > field_start && req_end <= field_end)) { - tmp_val = 0; - - err = xen_pcibk_config_read(dev, field_start, - field->size, &tmp_val); + if (offset + size > field_start && field_end > offset) { + err = conf_space_read(dev, cfg_entry, field_start, + &tmp_val); if (err) break; tmp_val = merge_value(tmp_val, value, get_mask(size), - req_start - field_start); + offset - field_start); err = conf_space_write(dev, cfg_entry, field_start, tmp_val); @@ -262,7 +251,7 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) * This means that some fields may still be read-only because * they have entries in the config_field list that intercept * the write and do nothing. 
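The rewritten condition above, offset + size > field_start && field_end > offset, is a plain half-open interval intersection; unlike the old pair of endpoint checks it also matches an emulated field that lies strictly inside a larger request. A standalone comparison of the two predicates (the offsets and sizes are invented, loosely modelled on a dword read covering the interrupt-pin byte):

#include <stdio.h>
#include <stdbool.h>

/* Old check: does either endpoint of the request land inside the field? */
static bool overlaps_old(int offset, int size, int field_start, int field_end)
{
        int req_start = offset, req_end = offset + size;

        return (req_start >= field_start && req_start < field_end) ||
               (req_end > field_start && req_end <= field_end);
}

/* New check: half-open interval intersection, as in the hunk above. */
static bool overlaps_new(int offset, int size, int field_start, int field_end)
{
        return offset + size > field_start && field_end > offset;
}

int main(void)
{
        /* 4-byte request at 0x10 against a 2-byte field at 0x12: both agree. */
        printf("old=%d new=%d\n", overlaps_old(0x10, 4, 0x12, 0x14),
                                  overlaps_new(0x10, 4, 0x12, 0x14));
        /* 1-byte field at 0x3D strictly inside a 4-byte request at 0x3C:
         * only the new predicate reports the overlap. */
        printf("old=%d new=%d\n", overlaps_old(0x3c, 4, 0x3d, 0x3e),
                                  overlaps_new(0x3c, 4, 0x3d, 0x3e));
        return 0;
}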
*/ - if (dev_data->permissive || permissive) { + if (dev_data->permissive || xen_pcibk_permissive) { switch (size) { case 1: err = pci_write_config_byte(dev, offset, @@ -293,6 +282,50 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) return xen_pcibios_err_to_errno(err); } +int xen_pcibk_get_interrupt_type(struct pci_dev *dev) +{ + int err; + u16 val; + int ret = 0; + + /* + * Do not trust dev->msi(x)_enabled here, as enabling could be done + * bypassing the pci_*msi* functions, by the qemu. + */ + if (dev->msi_cap) { + err = pci_read_config_word(dev, + dev->msi_cap + PCI_MSI_FLAGS, + &val); + if (err) + return err; + if (val & PCI_MSI_FLAGS_ENABLE) + ret |= INTERRUPT_TYPE_MSI; + } + if (dev->msix_cap) { + err = pci_read_config_word(dev, + dev->msix_cap + PCI_MSIX_FLAGS, + &val); + if (err) + return err; + if (val & PCI_MSIX_FLAGS_ENABLE) + ret |= INTERRUPT_TYPE_MSIX; + } + + /* + * PCIe spec says device cannot use INTx if MSI/MSI-X is enabled, + * so check for INTx only when both are disabled. + */ + if (!ret) { + err = pci_read_config_word(dev, PCI_COMMAND, &val); + if (err) + return err; + if (!(val & PCI_COMMAND_INTX_DISABLE)) + ret |= INTERRUPT_TYPE_INTX; + } + + return ret ?: INTERRUPT_TYPE_NONE; +} + void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) { struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h index e56c934ad137..5fe431c79f25 100644 --- a/drivers/xen/xen-pciback/conf_space.h +++ b/drivers/xen/xen-pciback/conf_space.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend - Common data structures for overriding the configuration space * @@ -64,6 +65,13 @@ struct config_field_entry { void *data; }; +#define INTERRUPT_TYPE_NONE (0) +#define INTERRUPT_TYPE_INTX (1<<0) +#define INTERRUPT_TYPE_MSI (1<<1) +#define INTERRUPT_TYPE_MSIX (1<<2) + +extern bool xen_pcibk_permissive; + #define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) /* Add fields to a device - the add_fields macro expects to get a pointer to @@ -123,4 +131,6 @@ int xen_pcibk_config_capability_init(void); int xen_pcibk_config_header_add_fields(struct pci_dev *dev); int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); +int xen_pcibk_get_interrupt_type(struct pci_dev *dev); + #endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c index 7f83e9083e9d..cf568e899ee2 100644 --- a/drivers/xen/xen-pciback/conf_space_capability.c +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handles the virtual fields found on the capability lists * in the configuration space. @@ -115,14 +116,13 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, { int err; u16 old_value; - pci_power_t new_state, old_state; + pci_power_t new_state; err = pci_read_config_word(dev, offset, &old_value); if (err) goto out; - old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); - new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); new_value &= PM_OK_BITS; if ((old_value & PM_OK_BITS) != new_value) { @@ -160,7 +160,7 @@ static void *pm_ctrl_init(struct pci_dev *dev, int offset) } out: - return ERR_PTR(err); + return err ? 
ERR_PTR(err) : NULL; } static const struct config_field caplist_pm[] = { @@ -189,6 +189,94 @@ static const struct config_field caplist_pm[] = { {} }; +static struct msi_msix_field_config { + u16 enable_bit; /* bit for enabling MSI/MSI-X */ + u16 allowed_bits; /* bits allowed to be changed */ + unsigned int int_type; /* interrupt type for exclusiveness check */ +} msi_field_config = { + .enable_bit = PCI_MSI_FLAGS_ENABLE, + .allowed_bits = PCI_MSI_FLAGS_ENABLE, + .int_type = INTERRUPT_TYPE_MSI, +}, msix_field_config = { + .enable_bit = PCI_MSIX_FLAGS_ENABLE, + .allowed_bits = PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL, + .int_type = INTERRUPT_TYPE_MSIX, +}; + +static void *msi_field_init(struct pci_dev *dev, int offset) +{ + return &msi_field_config; +} + +static void *msix_field_init(struct pci_dev *dev, int offset) +{ + return &msix_field_config; +} + +static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + const struct msi_msix_field_config *field_config = data; + const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + if (xen_pcibk_permissive || dev_data->permissive) + goto write; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + return err; + + if (new_value == old_value) + return 0; + + if (!dev_data->allow_interrupt_control || + (new_value ^ old_value) & ~field_config->allowed_bits) + return PCIBIOS_SET_FAILED; + + if (new_value & field_config->enable_bit) { + /* + * Don't allow enabling together with other interrupt type, but do + * allow enabling MSI(-X) while INTx is still active to please Linuxes + * MSI(-X) startup sequence. It is safe to do, as according to PCI + * spec, device with enabled MSI(-X) shouldn't use INTx. + */ + int int_type = xen_pcibk_get_interrupt_type(dev); + + if (int_type == INTERRUPT_TYPE_NONE || + int_type == INTERRUPT_TYPE_INTX || + int_type == field_config->int_type) + goto write; + return PCIBIOS_SET_FAILED; + } + +write: + return pci_write_config_word(dev, offset, new_value); +} + +static const struct config_field caplist_msix[] = { + { + .offset = PCI_MSIX_FLAGS, + .size = 2, + .init = msix_field_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = msi_msix_flags_write, + }, + {} +}; + +static const struct config_field caplist_msi[] = { + { + .offset = PCI_MSI_FLAGS, + .size = 2, + .init = msi_field_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = msi_msix_flags_write, + }, + {} +}; + static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { .capability = PCI_CAP_ID_PM, .fields = caplist_pm, @@ -197,11 +285,21 @@ static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = { .capability = PCI_CAP_ID_VPD, .fields = caplist_vpd, }; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msi = { + .capability = PCI_CAP_ID_MSI, + .fields = caplist_msi, +}; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msix = { + .capability = PCI_CAP_ID_MSIX, + .fields = caplist_msix, +}; int xen_pcibk_config_capability_init(void) { register_capability(&xen_pcibk_config_capability_vpd); register_capability(&xen_pcibk_config_capability_pm); + register_capability(&xen_pcibk_config_capability_msi); + register_capability(&xen_pcibk_config_capability_msix); return 0; } diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index c5ee82587e8c..fc0332645966 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ 
b/drivers/xen/xen-pciback/conf_space_header.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handles the virtual fields in the configuration space headers. * @@ -5,12 +6,17 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" #include "conf_space.h" +struct pci_cmd_info { + u16 val; +}; + struct pci_bar_info { u32 val; u32 len_val; @@ -20,22 +26,36 @@ struct pci_bar_info { #define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) #define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) -static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +/* Bits guests are allowed to control in permissive mode. */ +#define PCI_COMMAND_GUEST (PCI_COMMAND_MASTER|PCI_COMMAND_SPECIAL| \ + PCI_COMMAND_INVALIDATE|PCI_COMMAND_VGA_PALETTE| \ + PCI_COMMAND_WAIT|PCI_COMMAND_FAST_BACK) + +static void *command_init(struct pci_dev *dev, int offset) { - int i; - int ret; - - ret = xen_pcibk_read_config_word(dev, offset, value, data); - if (!pci_is_enabled(dev)) - return ret; - - for (i = 0; i < PCI_ROM_RESOURCE; i++) { - if (dev->resource[i].flags & IORESOURCE_IO) - *value |= PCI_COMMAND_IO; - if (dev->resource[i].flags & IORESOURCE_MEM) - *value |= PCI_COMMAND_MEMORY; + struct pci_cmd_info *cmd = kmalloc(sizeof(*cmd), GFP_KERNEL); + int err; + + if (!cmd) + return ERR_PTR(-ENOMEM); + + err = pci_read_config_word(dev, PCI_COMMAND, &cmd->val); + if (err) { + kfree(cmd); + return ERR_PTR(err); } + return cmd; +} + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ + int ret = pci_read_config_word(dev, offset, value); + const struct pci_cmd_info *cmd = data; + + *value &= PCI_COMMAND_GUEST; + *value |= cmd->val & ~PCI_COMMAND_GUEST; + return ret; } @@ -43,46 +63,64 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) { struct xen_pcibk_dev_data *dev_data; int err; + u16 val; + struct pci_cmd_info *cmd = data; dev_data = pci_get_drvdata(dev); if (!pci_is_enabled(dev) && is_enable_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable\n", - pci_name(dev)); + dev_dbg(&dev->dev, "enable\n"); err = pci_enable_device(dev); if (err) return err; if (dev_data) dev_data->enable_intx = 1; } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable\n", - pci_name(dev)); + dev_dbg(&dev->dev, "disable\n"); pci_disable_device(dev); if (dev_data) dev_data->enable_intx = 0; } if (!dev->is_busmaster && is_master_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", - pci_name(dev)); + dev_dbg(&dev->dev, "set bus master\n"); pci_set_master(dev); + } else if (dev->is_busmaster && !is_master_cmd(value)) { + dev_dbg(&dev->dev, "clear bus master\n"); + pci_clear_master(dev); } - if (value & PCI_COMMAND_INVALIDATE) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG - DRV_NAME ": %s: enable memory-write-invalidate\n", - pci_name(dev)); + if (!(cmd->val & PCI_COMMAND_INVALIDATE) && + (value & PCI_COMMAND_INVALIDATE)) { + dev_dbg(&dev->dev, "enable memory-write-invalidate\n"); err = pci_set_mwi(dev); if (err) { - pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", - pci_name(dev), err); + dev_warn(&dev->dev, "cannot enable memory-write-invalidate (%d)\n", + err); value &= ~PCI_COMMAND_INVALIDATE; } + } else if ((cmd->val & PCI_COMMAND_INVALIDATE) && + 
!(value & PCI_COMMAND_INVALIDATE)) { + dev_dbg(&dev->dev, "disable memory-write-invalidate\n"); + pci_clear_mwi(dev); } + if (dev_data && dev_data->allow_interrupt_control && + ((cmd->val ^ value) & PCI_COMMAND_INTX_DISABLE)) + pci_intx(dev, !(value & PCI_COMMAND_INTX_DISABLE)); + + cmd->val = value; + + if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive)) + return 0; + + /* Only allow the guest to control certain bits. */ + err = pci_read_config_word(dev, offset, &val); + if (err || val == value) + return err; + + value &= PCI_COMMAND_GUEST; + value |= val & ~PCI_COMMAND_GUEST; + return pci_write_config_word(dev, offset, value); } @@ -91,15 +129,14 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~PCI_ROM_ADDRESS_ENABLE) + if ((value | ~PCI_ROM_ADDRESS_MASK) == ~0U) bar->which = 1; else { u32 tmpval; @@ -123,17 +160,25 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) { struct pci_bar_info *bar = data; + unsigned int pos = (offset - PCI_BASE_ADDRESS_0) / 4; + const struct resource *res = dev->resource; + u32 mask; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~0) + if (res[pos].flags & IORESOURCE_IO) + mask = ~PCI_BASE_ADDRESS_IO_MASK; + else if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) + mask = 0; + else + mask = ~PCI_BASE_ADDRESS_MEM_MASK; + if ((value | mask) == ~0U) bar->which = 1; else { u32 tmpval; @@ -153,8 +198,7 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } @@ -163,54 +207,39 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) return 0; } -static inline void read_dev_bar(struct pci_dev *dev, - struct pci_bar_info *bar_info, int offset, - u32 len_mask) +static void *bar_init(struct pci_dev *dev, int offset) { - int pos; - struct resource *res = dev->resource; + unsigned int pos; + const struct resource *res = dev->resource; + struct pci_bar_info *bar = kzalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) pos = PCI_ROM_RESOURCE; else { pos = (offset - PCI_BASE_ADDRESS_0) / 4; - if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | - PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == - (PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64))) { - bar_info->val = res[pos - 1].start >> 32; - bar_info->len_val = res[pos - 1].end >> 32; - return; + if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) { + /* + * Use ">> 16 >> 16" instead of direct ">> 32" shift + * to avoid warnings on 32-bit architectures. 
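command_read() above reports the guest-controllable PCI_COMMAND bits straight from the hardware and fills in the remaining bits from the cached, emulated value; the permissive path of command_write() performs the mirror-image merge before writing. A standalone sketch of the read-side merge (the mask is trimmed and both register values are made up):

#include <stdio.h>
#include <stdint.h>

/* Guest-controllable PCI_COMMAND bits; the real PCI_COMMAND_GUEST mask above
 * also includes SPECIAL, VGA_PALETTE, WAIT and FAST_BACK. */
#define PCI_COMMAND_MASTER      0x0004
#define PCI_COMMAND_INVALIDATE  0x0010
#define PCI_COMMAND_GUEST       (PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE)

int main(void)
{
        uint16_t hw_val  = 0x0107;      /* what the device really reports (made up) */
        uint16_t emu_val = 0x0003;      /* cached value pciback emulates (made up) */

        /* command_read(): guest bits follow the hardware, the rest follows
         * the emulated, cached value. */
        uint16_t guest_sees = (hw_val & PCI_COMMAND_GUEST) |
                              (emu_val & (uint16_t)~PCI_COMMAND_GUEST);

        printf("guest reads PCI_COMMAND = %#06x\n", (unsigned)guest_sees);  /* 0x0007 */
        return 0;
}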
+ */ + bar->val = res[pos - 1].start >> 16 >> 16; + bar->len_val = -resource_size(&res[pos - 1]) >> 16 >> 16; + return bar; } } - bar_info->val = res[pos].start | - (res[pos].flags & PCI_REGION_FLAG_MASK); - bar_info->len_val = resource_size(&res[pos]); -} + if (!res[pos].flags || + (res[pos].flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | + IORESOURCE_BUSY))) + return bar; -static void *bar_init(struct pci_dev *dev, int offset) -{ - struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); - - if (!bar) - return ERR_PTR(-ENOMEM); - - read_dev_bar(dev, bar, offset, ~0); - bar->which = 0; - - return bar; -} - -static void *rom_init(struct pci_dev *dev, int offset) -{ - struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); - - if (!bar) - return ERR_PTR(-ENOMEM); - - read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); - bar->which = 0; + bar->val = res[pos].start | + (res[pos].flags & PCI_REGION_FLAG_MASK); + bar->len_val = -resource_size(&res[pos]) | + (res[pos].flags & PCI_REGION_FLAG_MASK); return bar; } @@ -282,6 +311,8 @@ static const struct config_field header_common[] = { { .offset = PCI_COMMAND, .size = 2, + .init = command_init, + .release = bar_release, .u.w.read = command_read, .u.w.write = command_write, }, @@ -331,7 +362,7 @@ static const struct config_field header_common[] = { { \ .offset = reg_offset, \ .size = 4, \ - .init = rom_init, \ + .init = bar_init, \ .reset = bar_reset, \ .release = bar_release, \ .u.dw.read = bar_read, \ @@ -375,8 +406,8 @@ int xen_pcibk_config_header_add_fields(struct pci_dev *dev) default: err = -EINVAL; - pr_err("%s: Unsupported header type %d!\n", - pci_name(dev), dev->hdr_type); + dev_err(&dev->dev, "Unsupported header type %d!\n", + dev->hdr_type); break; } diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c index 7476791cab40..7dc281086302 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.c +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handle special overlays for broken devices. * @@ -5,6 +6,8 @@ * Author: Chris Bookholt <hap10@epoch.ncsc.mil> */ +#define dev_fmt(fmt) DRV_NAME ": " fmt + #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" @@ -34,8 +37,8 @@ static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) if (match_one_device(&tmp_quirk->devid, dev) != NULL) goto out; tmp_quirk = NULL; - printk(KERN_DEBUG DRV_NAME - ": quirk didn't match any device known\n"); + dev_printk(KERN_DEBUG, &dev->dev, + "quirk didn't match any device known\n"); out: return tmp_quirk; } @@ -94,7 +97,7 @@ int xen_pcibk_config_quirks_init(struct pci_dev *dev) struct xen_pcibk_config_quirk *quirk; int ret = 0; - quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); + quirk = kzalloc(sizeof(*quirk), GFP_KERNEL); if (!quirk) { ret = -ENOMEM; goto out; diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h index cfcc517e4570..fc1557dfef49 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.h +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend - Data structures for special overlays for broken devices. 
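bar_init() above pre-computes len_val as the negated resource size with the low flag bits kept, which is what a guest expects to read back after the all-ones sizing write that bar_write() now detects with (value | mask) == ~0U. A standalone walk-through of that encoding for a 16 KiB memory BAR (the size and flag bits are invented):

#include <stdio.h>
#include <stdint.h>

#define PCI_BASE_ADDRESS_MEM_MASK (~0x0fu)      /* low 4 bits of a memory BAR are flags */

int main(void)
{
        uint32_t size  = 0x4000;        /* 16 KiB resource size (made up) */
        uint32_t flags = 0x8;           /* e.g. the prefetchable bit, illustrative */

        /* What bar_init() stores in len_val: -resource_size() with flags kept. */
        uint32_t len_val = -size | flags;

        /* How a guest decodes the sizing read: mask the flags, negate. */
        uint32_t decoded = ~(len_val & PCI_BASE_ADDRESS_MEM_MASK) + 1;
        printf("len_val=%#010x decoded size=%#x\n", len_val, decoded); /* 0xffffc008, 0x4000 */

        /* The write that switches to the sizing view: all ones in every bit the
         * guest can program; the mask covers the read-only flag bits. */
        uint32_t guest_write = 0xffffffffu;
        uint32_t mask = ~PCI_BASE_ADDRESS_MEM_MASK;
        printf("sizing write detected: %d\n", (guest_write | mask) == ~0u);     /* 1 */
        return 0;
}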
* @@ -20,8 +21,6 @@ struct xen_pcibk_config_quirk { int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field *field); -int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); - int xen_pcibk_config_quirks_init(struct pci_dev *dev); void xen_pcibk_config_field_free(struct config_field *field); diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c index 828dddc360df..66e9b814cc86 100644 --- a/drivers/xen/xen-pciback/passthrough.c +++ b/drivers/xen/xen-pciback/passthrough.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Provides restricted access to the real PCI bus topology * to the frontend @@ -69,7 +70,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, } static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev) + struct pci_dev *dev, bool lock) { struct passthrough_dev_data *dev_data = pdev->pci_dev_data; struct pci_dev_entry *dev_entry, *t; @@ -87,8 +88,13 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, mutex_unlock(&dev_data->lock); - if (found_dev) + if (found_dev) { + if (lock) + device_lock(&found_dev->dev); pcistub_put_pci_dev(found_dev); + if (lock) + device_unlock(&found_dev->dev); + } } static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) @@ -156,8 +162,11 @@ static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) struct pci_dev_entry *dev_entry, *t; list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + struct pci_dev *dev = dev_entry->dev; list_del(&dev_entry->list); - pcistub_put_pci_dev(dev_entry->dev); + device_lock(&dev->dev); + pcistub_put_pci_dev(dev); + device_unlock(&dev->dev); kfree(dev_entry); } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 62fcd485f0a7..045e74847fe6 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/module.h> #include <linux/init.h> @@ -18,13 +19,19 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <xen/events.h> -#include <asm/xen/pci.h> +#include <xen/pci.h> +#include <xen/xen.h> +#ifdef CONFIG_XEN_ACPI +#include <xen/acpi.h> +#endif #include <asm/xen/hypervisor.h> #include <xen/interface/physdev.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" +#define PCISTUB_DRIVER_NAME "pciback" + static char *pci_devs_to_hide; wait_queue_head_t xen_pcibk_aer_wait_queue; /*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, @@ -49,6 +56,9 @@ struct pcistub_device { struct pci_dev *dev; struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +#ifdef CONFIG_XEN_ACPI + int gsi; +#endif }; /* Access to pcistub_devices & seized_devices lists and the initialize_devices @@ -69,7 +79,7 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) dev_dbg(&dev->dev, "pcistub_device_alloc\n"); - psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); + psdev = kzalloc(sizeof(*psdev), GFP_KERNEL); if (!psdev) return NULL; @@ -81,10 +91,23 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) kref_init(&psdev->kref); spin_lock_init(&psdev->lock); +#ifdef CONFIG_XEN_ACPI + psdev->gsi = -1; +#endif return psdev; } +static int pcistub_reset_device_state(struct pci_dev *dev) +{ + __pci_reset_function_locked(dev); + + if (!xen_pv_domain()) + return xen_reset_device(dev); + else 
+ return 0; +} + /* Don't call this directly as it's called by pcistub_device_put */ static void pcistub_device_release(struct kref *kref) { @@ -103,9 +126,10 @@ static void pcistub_device_release(struct kref *kref) /* Call the reset function which does not take lock as this * is called from "unbind" which takes a device_lock mutex. */ - __pci_reset_function_locked(dev); - if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) - dev_dbg(&dev->dev, "Could not reload PCI state\n"); + pcistub_reset_device_state(dev); + if (dev_data && + pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_info(&dev->dev, "Could not reload PCI state\n"); else pci_restore_state(dev); @@ -118,7 +142,7 @@ static void pcistub_device_release(struct kref *kref) int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, &ppdev); - if (err) + if (err && err != -ENOSYS) dev_warn(&dev->dev, "MSI-X release failed (%d)\n", err); } @@ -133,7 +157,7 @@ static void pcistub_device_release(struct kref *kref) xen_pcibk_config_free_dyn_fields(dev); xen_pcibk_config_free_dev(dev); - dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_clear_dev_assigned(dev); pci_dev_put(dev); kfree(psdev); @@ -149,13 +173,10 @@ static inline void pcistub_device_put(struct pcistub_device *psdev) kref_put(&psdev->kref, pcistub_device_release); } -static struct pcistub_device *pcistub_device_find(int domain, int bus, - int slot, int func) +static struct pcistub_device *pcistub_device_find_locked(int domain, int bus, + int slot, int func) { - struct pcistub_device *psdev = NULL; - unsigned long flags; - - spin_lock_irqsave(&pcistub_devices_lock, flags); + struct pcistub_device *psdev; list_for_each_entry(psdev, &pcistub_devices, dev_list) { if (psdev->dev != NULL @@ -163,15 +184,25 @@ static struct pcistub_device *pcistub_device_find(int domain, int bus, && bus == psdev->dev->bus->number && slot == PCI_SLOT(psdev->dev->devfn) && func == PCI_FUNC(psdev->dev->devfn)) { - pcistub_device_get(psdev); - goto out; + return psdev; } } - /* didn't find it */ - psdev = NULL; + return NULL; +} + +static struct pcistub_device *pcistub_device_find(int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + psdev = pcistub_device_find_locked(domain, bus, slot, func); + if (psdev) + pcistub_device_get(psdev); -out: spin_unlock_irqrestore(&pcistub_devices_lock, flags); return psdev; } @@ -182,8 +213,6 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *pci_dev = NULL; unsigned long flags; - pcistub_device_get(psdev); - spin_lock_irqsave(&psdev->lock, flags); if (!psdev->pdev) { psdev->pdev = pdev; @@ -191,39 +220,33 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, } spin_unlock_irqrestore(&psdev->lock, flags); - if (!pci_dev) - pcistub_device_put(psdev); + if (pci_dev) + pcistub_device_get(psdev); return pci_dev; } -struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, - int domain, int bus, - int slot, int func) +#ifdef CONFIG_XEN_ACPI +static int pcistub_get_gsi_from_sbdf(unsigned int sbdf) { struct pcistub_device *psdev; - struct pci_dev *found_dev = NULL; - unsigned long flags; + int domain = (sbdf >> 16) & 0xffff; + int bus = PCI_BUS_NUM(sbdf); + int slot = PCI_SLOT(sbdf); + int func = PCI_FUNC(sbdf); - spin_lock_irqsave(&pcistub_devices_lock, flags); + psdev = pcistub_device_find(domain, bus, slot, func); - list_for_each_entry(psdev, 
&pcistub_devices, dev_list) { - if (psdev->dev != NULL - && domain == pci_domain_nr(psdev->dev->bus) - && bus == psdev->dev->bus->number - && slot == PCI_SLOT(psdev->dev->devfn) - && func == PCI_FUNC(psdev->dev->devfn)) { - found_dev = pcistub_device_get_pci_dev(pdev, psdev); - break; - } - } + if (!psdev) + return -ENODEV; - spin_unlock_irqrestore(&pcistub_devices_lock, flags); - return found_dev; + return psdev->gsi; } +#endif -struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev) +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func) { struct pcistub_device *psdev; struct pci_dev *found_dev = NULL; @@ -231,21 +254,31 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, spin_lock_irqsave(&pcistub_devices_lock, flags); - list_for_each_entry(psdev, &pcistub_devices, dev_list) { - if (psdev->dev == dev) { - found_dev = pcistub_device_get_pci_dev(pdev, psdev); - break; - } - } + psdev = pcistub_device_find_locked(domain, bus, slot, func); + if (psdev) + found_dev = pcistub_device_get_pci_dev(pdev, psdev); spin_unlock_irqrestore(&pcistub_devices_lock, flags); return found_dev; } +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + * + * To make this easier, the caller has to hold the device lock. + */ void pcistub_put_pci_dev(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; unsigned long flags; + struct xen_pcibk_dev_data *dev_data; + int ret; spin_lock_irqsave(&pcistub_devices_lock, flags); @@ -267,21 +300,30 @@ void pcistub_put_pci_dev(struct pci_dev *dev) /* Cleanup our device * (so it's ready for the next domain) */ + device_lock_assert(&dev->dev); + pcistub_reset_device_state(dev); - /* This is OK - we are running from workqueue context - * and want to inhibit the user from fiddling with 'reset' - */ - pci_reset_function(dev); - pci_restore_state(psdev->dev); - + dev_data = pci_get_drvdata(dev); + ret = pci_load_saved_state(dev, dev_data->pci_saved_state); + if (!ret) { + /* + * The usual sequence is pci_save_state & pci_restore_state + * but the guest might have messed the configuration space up. + * Use the initial version (when device was bound to us). + */ + pci_restore_state(dev); + } else + dev_info(&dev->dev, "Could not reload PCI state\n"); /* This disables the device. */ - xen_pcibk_reset_device(found_psdev->dev); + xen_pcibk_reset_device(dev); /* And cleanup up our emulated fields. 
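/* Editor's note (illustration only, not part of the patch): the reworked
 * pcistub_put_pci_dev() above asserts with device_lock_assert() that the
 * caller holds the struct device lock, because it performs the FLR and the
 * config-space restore and must be serialised against concurrent sysfs
 * bind/unbind (which already holds that lock, hence the lock=false case in
 * pcistub_remove()).  A minimal sketch of the calling convention used by the
 * release paths elsewhere in this series, assuming "pciback.h" is available:
 */
#include <linux/device.h>
#include <linux/pci.h>
#include "pciback.h"

static void example_release(struct pci_dev *found_dev, bool lock)
{
	if (lock)
		device_lock(&found_dev->dev);
	pcistub_put_pci_dev(found_dev);	/* resets the device, restores state */
	if (lock)
		device_unlock(&found_dev->dev);
}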
*/ - xen_pcibk_config_free_dyn_fields(found_psdev->dev); - xen_pcibk_config_reset_dev(found_psdev->dev); + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); + + dev_data->allow_interrupt_control = 0; - xen_unregister_device_domain_owner(found_psdev->dev); + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); found_psdev->pdev = NULL; @@ -329,11 +371,20 @@ static int pcistub_match(struct pci_dev *dev) return found; } -static int pcistub_init_device(struct pci_dev *dev) +static int pcistub_init_device(struct pcistub_device *psdev) { struct xen_pcibk_dev_data *dev_data; + struct pci_dev *dev; +#ifdef CONFIG_XEN_ACPI + int gsi, trigger, polarity; +#endif int err = 0; + if (!psdev) + return -EINVAL; + + dev = psdev->dev; + dev_dbg(&dev->dev, "initializing...\n"); /* The PCI backend is not intended to be a module (or to work with @@ -342,7 +393,7 @@ static int pcistub_init_device(struct pci_dev *dev) * here and then to call kfree(pci_get_drvdata(psdev->dev)). */ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]") - + strlen(pci_name(dev)) + 1, GFP_ATOMIC); + + strlen(pci_name(dev)) + 1, GFP_KERNEL); if (!dev_data) { err = -ENOMEM; goto out; @@ -382,7 +433,7 @@ static int pcistub_init_device(struct pci_dev *dev) }; err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); - if (err) + if (err && err != -ENOSYS) dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", err); } @@ -395,16 +446,33 @@ static int pcistub_init_device(struct pci_dev *dev) dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); else { dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); - __pci_reset_function_locked(dev); + err = pcistub_reset_device_state(dev); + if (err) + goto config_release; pci_restore_state(dev); } + +#ifdef CONFIG_XEN_ACPI + if (xen_initial_domain() && xen_pvh_domain()) { + err = xen_acpi_get_gsi_info(dev, &gsi, &trigger, &polarity); + if (err) { + dev_err(&dev->dev, "Fail to get gsi info!\n"); + goto config_release; + } + err = xen_pvh_setup_gsi(gsi, trigger, polarity); + if (err) + goto config_release; + psdev->gsi = gsi; + } +#endif + /* Now disable the device (this also ensures some private device * data is setup before we export) */ dev_dbg(&dev->dev, "reset device\n"); xen_pcibk_reset_device(dev); - dev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; + pci_set_dev_assigned(dev); return 0; config_release: @@ -437,7 +505,7 @@ static int __init pcistub_init_devices_late(void) spin_unlock_irqrestore(&pcistub_devices_lock, flags); - err = pcistub_init_device(psdev->dev); + err = pcistub_init_device(psdev); if (err) { dev_err(&psdev->dev->dev, "error %d initializing device\n", err); @@ -458,15 +526,48 @@ static int __init pcistub_init_devices_late(void) return 0; } -static int pcistub_seize(struct pci_dev *dev) +static void pcistub_device_id_add_list(struct pcistub_device_id *new, + int domain, int bus, unsigned int devfn) +{ + struct pcistub_device_id *pci_dev_id; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&device_ids_lock, flags); + + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { + if (pci_dev_id->domain == domain && pci_dev_id->bus == bus && + pci_dev_id->devfn == devfn) { + found = 1; + break; + } + } + + if (!found) { + new->domain = domain; + new->bus = bus; + new->devfn = devfn; + list_add_tail(&new->slot_list, &pcistub_device_ids); + } + + spin_unlock_irqrestore(&device_ids_lock, flags); + + if (found) + kfree(new); +} + +static int pcistub_seize(struct pci_dev *dev, + struct 
pcistub_device_id *pci_dev_id) { struct pcistub_device *psdev; unsigned long flags; int err = 0; psdev = pcistub_device_alloc(dev); - if (!psdev) + if (!psdev) { + kfree(pci_dev_id); return -ENOMEM; + } spin_lock_irqsave(&pcistub_devices_lock, flags); @@ -474,7 +575,7 @@ static int pcistub_seize(struct pci_dev *dev) spin_unlock_irqrestore(&pcistub_devices_lock, flags); /* don't want irqs disabled when calling pcistub_init_device */ - err = pcistub_init_device(psdev->dev); + err = pcistub_init_device(psdev); spin_lock_irqsave(&pcistub_devices_lock, flags); @@ -487,19 +588,30 @@ static int pcistub_seize(struct pci_dev *dev) spin_unlock_irqrestore(&pcistub_devices_lock, flags); - if (err) + if (err) { + kfree(pci_dev_id); pcistub_device_put(psdev); + } else if (pci_dev_id) + pcistub_device_id_add_list(pci_dev_id, pci_domain_nr(dev->bus), + dev->bus->number, dev->devfn); return err; } +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) { - int err = 0; + int err = 0, match; + struct pcistub_device_id *pci_dev_id = NULL; dev_dbg(&dev->dev, "probing...\n"); - if (pcistub_match(dev)) { + match = pcistub_match(dev); + + if ((dev->driver_override && + !strcmp(dev->driver_override, PCISTUB_DRIVER_NAME)) || + match) { if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { @@ -510,8 +622,16 @@ static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) goto out; } + if (!match) { + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); + if (!pci_dev_id) { + err = -ENOMEM; + goto out; + } + } + dev_info(&dev->dev, "seizing device\n"); - err = pcistub_seize(dev); + err = pcistub_seize(dev, pci_dev_id); } else /* Didn't find the device */ err = -ENODEV; @@ -520,6 +640,8 @@ out: return err; } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static void pcistub_remove(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -541,18 +663,23 @@ static void pcistub_remove(struct pci_dev *dev) spin_unlock_irqrestore(&pcistub_devices_lock, flags); if (found_psdev) { - dev_dbg(&dev->dev, "found device to remove - in use? %p\n", - found_psdev->pdev); + dev_dbg(&dev->dev, "found device to remove %s\n", + found_psdev->pdev ? "- in-use" : ""); if (found_psdev->pdev) { - pr_warn("****** removing device %s while still in-use! ******\n", - pci_name(found_psdev->dev)); - pr_warn("****** driver domain may still access this device's i/o resources!\n"); - pr_warn("****** shutdown driver domain before binding device\n"); - pr_warn("****** to other drivers or domains\n"); + int domid = xen_find_device_domain_owner(dev); + + dev_warn(&dev->dev, "****** removing device %s while still in-use by domain %d! ******\n", + pci_name(found_psdev->dev), domid); + dev_warn(&dev->dev, "****** driver domain may still access this device's i/o resources!\n"); + dev_warn(&dev->dev, "****** shutdown driver domain before binding device\n"); + dev_warn(&dev->dev, "****** to other drivers or domains\n"); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(found_psdev->pdev, - found_psdev->dev); + found_psdev->dev, + false /* caller holds the lock. 
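/* Editor's note (illustration only, not part of the patch): with the
 * driver_override check in pcistub_probe() above, pciback now also claims
 * devices whose driver_override is set to PCISTUB_DRIVER_NAME ("pciback"),
 * not only those listed via pci_devs_to_hide; the slot is then recorded via
 * pcistub_device_id_add_list().  A hedged user-space sketch of handing a
 * device over through the generic PCI sysfs interface; the BDF below is a
 * made-up example, and unbinding fails harmlessly if no driver is bound.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *bdf = "0000:03:00.0";	/* hypothetical device */
	char path[128];

	snprintf(path, sizeof(path),
		 "/sys/bus/pci/devices/%s/driver_override", bdf);
	write_str(path, "pciback");

	snprintf(path, sizeof(path),
		 "/sys/bus/pci/devices/%s/driver/unbind", bdf);
	write_str(path, bdf);

	return write_str("/sys/bus/pci/drivers/pciback/bind", bdf);
}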
*/); } spin_lock_irqsave(&pcistub_devices_lock, flags); @@ -564,7 +691,7 @@ static void pcistub_remove(struct pci_dev *dev) } } -static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = { +static const struct pci_device_id pcistub_ids[] = { { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, @@ -614,10 +741,12 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, { pci_ers_result_t res = result; struct xen_pcie_aer_op *aer_op; + struct xen_pcibk_device *pdev = psdev->pdev; + struct xen_pci_sharedinfo *sh_info = pdev->sh_info; int ret; /*with PV AER drivers*/ - aer_op = &(psdev->pdev->sh_info->aer_op); + aer_op = &(sh_info->aer_op); aer_op->cmd = aer_cmd ; /*useful for error_detected callback*/ aer_op->err = state; @@ -625,55 +754,53 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev, &aer_op->domain, &aer_op->bus, &aer_op->devfn); if (!ret) { - dev_err(&psdev->dev->dev, - DRV_NAME ": failed to get pcifront device\n"); + dev_err(&psdev->dev->dev, "failed to get pcifront device\n"); return PCI_ERS_RESULT_NONE; } wmb(); - dev_dbg(&psdev->dev->dev, - DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n", + dev_dbg(&psdev->dev->dev, "aer_op %x dom %x bus %x devfn %x\n", aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); /*local flag to mark there's aer request, xen_pcibk callback will use * this flag to judge whether we need to check pci-front give aer * service ack signal */ - set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + set_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags); /*It is possible that a pcifront conf_read_write ops request invokes * the callback which cause the spurious execution of wake_up. * Yet it is harmless and better than a spinlock here */ set_bit(_XEN_PCIB_active, - (unsigned long *)&psdev->pdev->sh_info->flags); + (unsigned long *)&sh_info->flags); wmb(); - notify_remote_via_irq(psdev->pdev->evtchn_irq); + notify_remote_via_irq(pdev->evtchn_irq); + + /* Enable IRQ to signal "request done". */ + xen_pcibk_lateeoi(pdev, 0); ret = wait_event_timeout(xen_pcibk_aer_wait_queue, !(test_bit(_XEN_PCIB_active, (unsigned long *) - &psdev->pdev->sh_info->flags)), 300*HZ); + &sh_info->flags)), 300*HZ); + + /* Enable IRQ for pcifront request if not already active. 
*/ + if (!test_bit(_PDEVF_op_active, &pdev->flags)) + xen_pcibk_lateeoi(pdev, 0); if (!ret) { if (test_bit(_XEN_PCIB_active, - (unsigned long *)&psdev->pdev->sh_info->flags)) { + (unsigned long *)&sh_info->flags)) { dev_err(&psdev->dev->dev, "pcifront aer process not responding!\n"); clear_bit(_XEN_PCIB_active, - (unsigned long *)&psdev->pdev->sh_info->flags); + (unsigned long *)&sh_info->flags); aer_op->err = PCI_ERS_RESULT_NONE; return res; } } - clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags); + clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags); - if (test_bit(_XEN_PCIF_active, - (unsigned long *)&psdev->pdev->sh_info->flags)) { - dev_dbg(&psdev->dev->dev, - "schedule pci_conf service in " DRV_NAME "\n"); - xen_pcibk_test_and_schedule_op(psdev->pdev); - } - - res = (pci_ers_result_t)aer_op->err; + res = (__force pci_ers_result_t)aer_op->err; return res; } @@ -700,13 +827,12 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -718,7 +844,7 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) "guest with no AER driver should have been killed\n"); goto end; } - result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); + result = common_process(psdev, pci_channel_io_normal, XEN_PCI_OP_aer_slotreset, result); if (result == PCI_ERS_RESULT_NONE || result == PCI_ERS_RESULT_DISCONNECT) { @@ -758,13 +884,12 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -776,7 +901,7 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) "guest with no AER driver should have been killed\n"); goto end; } - result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); + result = common_process(psdev, pci_channel_io_normal, XEN_PCI_OP_aer_mmio, result); if (result == PCI_ERS_RESULT_NONE || result == PCI_ERS_RESULT_DISCONNECT) { @@ -816,13 +941,12 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -870,13 +994,12 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not 
connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -889,7 +1012,7 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) kill_domain_by_device(psdev); goto end; } - common_process(psdev, 1, XEN_PCI_OP_aer_resume, + common_process(psdev, pci_channel_io_normal, XEN_PCI_OP_aer_resume, PCI_ERS_RESULT_RECOVERED); end: if (psdev) @@ -914,7 +1037,7 @@ static const struct pci_error_handlers xen_pcibk_error_handler = { static struct pci_driver xen_pcibk_pci_driver = { /* The name should be xen_pciback, but until the tools are updated * we will keep it as pciback. */ - .name = "pciback", + .name = PCISTUB_DRIVER_NAME, .id_table = pcistub_ids, .probe = pcistub_probe, .remove = pcistub_remove, @@ -981,7 +1104,6 @@ static inline int str_to_quirk(const char *buf, int *domain, int *bus, int static int pcistub_device_id_add(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id; - unsigned long flags; int rc = 0, devfn = PCI_DEVFN(slot, func); if (slot < 0) { @@ -1011,16 +1133,10 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) if (!pci_dev_id) return -ENOMEM; - pci_dev_id->domain = domain; - pci_dev_id->bus = bus; - pci_dev_id->devfn = devfn; - pr_debug("wants to seize %04x:%02x:%02x.%d\n", domain, bus, slot, func); - spin_lock_irqsave(&device_ids_lock, flags); - list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); - spin_unlock_irqrestore(&device_ids_lock, flags); + pcistub_device_id_add_list(pci_dev_id, domain, bus, devfn); return 0; } @@ -1073,7 +1189,7 @@ static int pcistub_reg_add(int domain, int bus, int slot, int func, } dev = psdev->dev; - field = kzalloc(sizeof(*field), GFP_ATOMIC); + field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) { err = -ENOMEM; goto out; @@ -1096,8 +1212,8 @@ out: return err; } -static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t new_slot_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1113,10 +1229,10 @@ out: err = count; return err; } -static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); +static DRIVER_ATTR_WO(new_slot); -static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t remove_slot_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1132,9 +1248,9 @@ out: err = count; return err; } -static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); +static DRIVER_ATTR_WO(remove_slot); -static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +static ssize_t slots_show(struct device_driver *drv, char *buf) { struct pcistub_device_id *pci_dev_id; size_t count = 0; @@ -1145,7 +1261,7 @@ static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) break; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "%04x:%02x:%02x.%d\n", pci_dev_id->domain, pci_dev_id->bus, PCI_SLOT(pci_dev_id->devfn), @@ -1155,9 +1271,9 @@ static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) return count; } -static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); +static DRIVER_ATTR_RO(slots); -static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +static ssize_t irq_handlers_show(struct device_driver *drv, char *buf) { struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; @@ -1174,7 +1290,7 @@ static 
ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) if (!dev_data) continue; count += - scnprintf(buf + count, PAGE_SIZE - count, + sysfs_emit_at(buf, count, "%s:%s:%sing:%ld\n", pci_name(psdev->dev), dev_data->isr_on ? "on" : "off", @@ -1184,11 +1300,10 @@ static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } -static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); +static DRIVER_ATTR_RO(irq_handlers); -static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, - const char *buf, - size_t count) +static ssize_t irq_handler_state_store(struct device_driver *drv, + const char *buf, size_t count) { struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; @@ -1225,11 +1340,10 @@ out: err = count; return err; } -static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, - pcistub_irq_handler_switch); +static DRIVER_ATTR_WO(irq_handler_state); -static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t quirks_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func, reg, size, mask; int err; @@ -1247,7 +1361,7 @@ out: return err; } -static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +static ssize_t quirks_show(struct device_driver *drv, char *buf) { int count = 0; unsigned long flags; @@ -1261,7 +1375,7 @@ static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) goto out; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", quirk->pdev->bus->number, PCI_SLOT(quirk->pdev->devfn), @@ -1277,7 +1391,7 @@ static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) goto out; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "\t\t%08x:%01x:%08x\n", cfg_entry->base_offset + field->offset, field->size, @@ -1290,11 +1404,10 @@ out: return count; } -static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, - pcistub_quirk_add); +static DRIVER_ATTR_RW(quirks); -static ssize_t permissive_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t permissive_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1349,14 +1462,72 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf) if (!dev_data || !dev_data->permissive) continue; count += - scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + sysfs_emit_at(buf, count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR_RW(permissive); + +static ssize_t allow_interrupt_control_store(struct device_driver *drv, + const char *buf, size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + dev_data->allow_interrupt_control = 1; +release: + pcistub_device_put(psdev); +out: + if (!err) + err = count; + return err; +} + 
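/* Editor's note (illustration only, not part of the patch): the new
 * allow_interrupt_control attribute sets a per-device
 * dev_data->allow_interrupt_control flag (consumed by the config-space
 * handling code, not shown in this hunk).  It is toggled by writing the
 * device's slot, in the same "dddd:bb:ss.f" form parsed by str_to_slot(),
 * to the driver's sysfs file.  The path and BDF below are assumptions
 * based on the driver name "pciback", not taken from the patch.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/bus/pci/drivers/pciback/allow_interrupt_control",
			"w");

	if (!f)
		return 1;
	fprintf(f, "0000:03:00.0\n");	/* hypothetical slot */
	return fclose(f);
}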
+static ssize_t allow_interrupt_control_show(struct device_driver *drv, + char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->allow_interrupt_control) + continue; + count += + sysfs_emit_at(buf, count, "%s\n", pci_name(psdev->dev)); } spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } -static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, - permissive_add); +static DRIVER_ATTR_RW(allow_interrupt_control); static void pcistub_exit(void) { @@ -1368,6 +1539,8 @@ static void pcistub_exit(void) driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_permissive); driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_allow_interrupt_control); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_irq_handlers); driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_irq_handler_state); @@ -1457,6 +1630,9 @@ static int __init pcistub_init(void) if (!err) err = driver_create_file(&xen_pcibk_pci_driver.driver, &driver_attr_permissive); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_allow_interrupt_control); if (!err) err = driver_create_file(&xen_pcibk_pci_driver.driver, @@ -1487,6 +1663,53 @@ parse_error: fs_initcall(pcistub_init); #endif +#ifdef CONFIG_PCI_IOV +static struct pcistub_device *find_vfs(const struct pci_dev *pdev) +{ + struct pcistub_device *psdev = NULL; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (!psdev->pdev && psdev->dev != pdev + && pci_physfn(psdev->dev) == pdev) { + found = true; + break; + } + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + if (found) + return psdev; + return NULL; +} + +static int pci_stub_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + const struct pci_dev *pdev = to_pci_dev(dev); + + if (action != BUS_NOTIFY_UNBIND_DRIVER) + return NOTIFY_DONE; + + if (!pdev->is_physfn) + return NOTIFY_DONE; + + for (;;) { + struct pcistub_device *psdev = find_vfs(pdev); + if (!psdev) + break; + device_release_driver(&psdev->dev->dev); + } + return NOTIFY_DONE; +} + +static struct notifier_block pci_stub_nb = { + .notifier_call = pci_stub_notifier, +}; +#endif + static int __init xen_pcibk_init(void) { int err; @@ -1508,12 +1731,27 @@ static int __init xen_pcibk_init(void) err = xen_pcibk_xenbus_register(); if (err) pcistub_exit(); +#ifdef CONFIG_PCI_IOV + else + bus_register_notifier(&pci_bus_type, &pci_stub_nb); +#endif + +#ifdef CONFIG_XEN_ACPI + xen_acpi_register_get_gsi_func(pcistub_get_gsi_from_sbdf); +#endif return err; } static void __exit xen_pcibk_cleanup(void) { +#ifdef CONFIG_XEN_ACPI + xen_acpi_register_get_gsi_func(NULL); +#endif + +#ifdef CONFIG_PCI_IOV + bus_unregister_notifier(&pci_bus_type, &pci_stub_nb); +#endif xen_pcibk_xenbus_unregister(); pcistub_exit(); } @@ -1521,5 +1759,6 @@ static void __exit xen_pcibk_cleanup(void) module_init(xen_pcibk_init); module_exit(xen_pcibk_cleanup); +MODULE_DESCRIPTION("Xen PCI-device stub driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h 
b/drivers/xen/xen-pciback/pciback.h index f72af87640e0..b786c1f74f85 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend Common Data Structures & Function Declarations * @@ -13,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/atomic.h> +#include <xen/events.h> #include <xen/interface/io/pciif.h> #define DRV_NAME "xen-pciback" @@ -26,6 +28,8 @@ struct pci_dev_entry { #define PDEVF_op_active (1<<(_PDEVF_op_active)) #define _PCIB_op_pending (1) #define PCIB_op_pending (1<<(_PCIB_op_pending)) +#define _EOI_pending (2) +#define EOI_pending (1<<(_EOI_pending)) struct xen_pcibk_device { void *pci_dev_data; @@ -37,24 +41,25 @@ struct xen_pcibk_device { struct xen_pci_sharedinfo *sh_info; unsigned long flags; struct work_struct op_work; + struct xen_pci_op op; }; struct xen_pcibk_dev_data { struct list_head config_fields; struct pci_saved_state *pci_saved_state; unsigned int permissive:1; + unsigned int allow_interrupt_control:1; unsigned int warned_on_write:1; unsigned int enable_intx:1; unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ unsigned int ack_intr:1; /* .. and ACK-ing */ unsigned long handled; unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ - char irq_name[0]; /* xen-pcibk[000:04:00.0] */ + char irq_name[]; /* xen-pcibk[000:04:00.0] */ }; /* Used by XenBus and xen_pcibk_ops.c */ extern wait_queue_head_t xen_pcibk_aer_wait_queue; -extern struct workqueue_struct *xen_pcibk_wq; /* Used by pcistub.c and conf_space_quirks.c */ extern struct list_head xen_pcibk_quirks; @@ -62,10 +67,13 @@ extern struct list_head xen_pcibk_quirks; struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, int domain, int bus, int slot, int func); -struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev); void pcistub_put_pci_dev(struct pci_dev *dev); +static inline bool xen_pcibk_pv_support(void) +{ + return IS_ENABLED(CONFIG_XEN_PCIDEV_BACKEND); +} + /* Ensure a device is turned off or reset */ void xen_pcibk_reset_device(struct pci_dev *pdev); @@ -99,7 +107,8 @@ struct xen_pcibk_backend { unsigned int *domain, unsigned int *bus, unsigned int *devfn); int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb); - void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev); + void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev, + bool lock); int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev, int devid, publish_pci_dev_cb publish_cb); struct pci_dev *(*get)(struct xen_pcibk_device *pdev, @@ -122,10 +131,10 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, } static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev) + struct pci_dev *dev, bool lock) { if (xen_pcibk_backend && xen_pcibk_backend->release) - return xen_pcibk_backend->release(pdev, dev); + return xen_pcibk_backend->release(pdev, dev, lock); } static inline struct pci_dev * @@ -180,13 +189,13 @@ static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); void xen_pcibk_do_op(struct work_struct *data); +static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev, + unsigned int eoi_flag) +{ + if (test_and_clear_bit(_EOI_pending, &pdev->flags)) + xen_irq_lateeoi(pdev->evtchn_irq, eoi_flag); +} + int xen_pcibk_xenbus_register(void); void 
xen_pcibk_xenbus_unregister(void); - -extern int verbose_request; - -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); #endif - -/* Handles shared IRQs that can to device domain and control domain. */ -void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 64eb0cd8b8af..84e014490950 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend Operations - respond to PCI requests from Frontend * @@ -5,17 +6,15 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/wait.h> #include <linux/bitops.h> #include <xen/events.h> #include <linux/sched.h> #include "pciback.h" -int verbose_request; -module_param(verbose_request, int, 0644); - static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id); /* Ensure a device is has the fake IRQ handler "turned on/off" and is @@ -70,6 +69,13 @@ static void xen_pcibk_control_isr(struct pci_dev *dev, int reset) enable ? "enable" : "disable"); if (enable) { + /* + * The MSI or MSI-X should not have an IRQ handler. Otherwise + * if the guest terminates we BUG_ON in free_msi_irqs. + */ + if (dev->msi_enabled || dev->msix_enabled) + goto out; + rc = request_irq(dev_data->irq, xen_pcibk_guest_interrupt, IRQF_SHARED, dev_data->irq_name, dev); @@ -119,8 +125,6 @@ void xen_pcibk_reset_device(struct pci_dev *dev) if (pci_is_enabled(dev)) pci_disable_device(dev); - pci_write_config_word(dev, PCI_COMMAND, 0); - dev->is_busmaster = 0; } else { pci_read_config_word(dev, PCI_COMMAND, &cmd); @@ -141,26 +145,26 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, struct xen_pcibk_dev_data *dev_data; int status; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); - - status = pci_enable_msi(dev); + if (dev->msi_enabled) + status = -EALREADY; + else if (dev->msix_enabled) + status = -ENXIO; + else + status = pci_enable_msi(dev); if (status) { - pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", - pci_name(dev), pdev->xdev->otherend_id, - status); + dev_warn_ratelimited(&dev->dev, "error enabling MSI for guest %u: err %d\n", + pdev->xdev->otherend_id, status); op->value = 0; return XEN_PCI_ERR_op_failed; } - /* The value the guest needs is actually the IDT vector, not the + /* The value the guest needs is actually the IDT vector, not * the local domain's IRQ number. */ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), - op->value); + + dev_dbg(&dev->dev, "MSI: %d\n", op->value); dev_data = pci_get_drvdata(dev); if (dev_data) @@ -173,20 +177,19 @@ static int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - struct xen_pcibk_dev_data *dev_data; + if (dev->msi_enabled) { + struct xen_pcibk_dev_data *dev_data; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", - pci_name(dev)); - pci_disable_msi(dev); + pci_disable_msi(dev); + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + } op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), - op->value); - dev_data = pci_get_drvdata(dev); - if (dev_data) - dev_data->ack_intr = 1; + + dev_dbg(&dev->dev, "MSI: %d\n", op->value); + return 0; } @@ -197,14 +200,26 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, struct xen_pcibk_dev_data *dev_data; int i, result; struct msix_entry *entries; + u16 cmd; + + dev_dbg(&dev->dev, "enable MSI-X\n"); - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", - pci_name(dev)); if (op->value > SH_INFO_MAX_VEC) return -EINVAL; - entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); + if (dev->msix_enabled) + return -EALREADY; + + /* + * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able + * to access the BARs where the MSI-X entries reside. + * But VF devices are unique in which the PF needs to be checked. + */ + pci_read_config_word(pci_physfn(dev), PCI_COMMAND, &cmd); + if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY)) + return -ENXIO; + + entries = kmalloc_array(op->value, sizeof(*entries), GFP_KERNEL); if (entries == NULL) return -ENOMEM; @@ -213,24 +228,20 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; - if (entries[i].vector) + if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: " \ - "MSI-X[%d]: %d\n", - pci_name(dev), i, - op->msix_entries[i].vector); + dev_dbg(&dev->dev, "MSI-X[%d]: %d\n", i, + op->msix_entries[i].vector); + } } } else - pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", - pci_name(dev), pdev->xdev->otherend_id, - result); + dev_warn_ratelimited(&dev->dev, "error enabling MSI-X for guest %u: err %d!\n", + pdev->xdev->otherend_id, result); kfree(entries); op->value = result; @@ -245,46 +256,61 @@ static int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - struct xen_pcibk_dev_data *dev_data; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", - pci_name(dev)); - pci_disable_msix(dev); + if (dev->msix_enabled) { + struct xen_pcibk_dev_data *dev_data; + pci_disable_msix(dev); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + } /* * SR-IOV devices (which don't have any legacy IRQ) have * an undefined IRQ value of zero. */ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev), - op->value); - dev_data = pci_get_drvdata(dev); - if (dev_data) - dev_data->ack_intr = 1; + + dev_dbg(&dev->dev, "MSI-X: %d\n", op->value); + return 0; } #endif + +static inline bool xen_pcibk_test_op_pending(struct xen_pcibk_device *pdev) +{ + return test_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags) && + !test_and_set_bit(_PDEVF_op_active, &pdev->flags); +} + /* * Now the same evtchn is used for both pcifront conf_read_write request * as well as pcie aer front end ack. 
We use a new work_queue to schedule * xen_pcibk conf_read_write service for avoiding confict with aer_core * do_recovery job which also use the system default work_queue */ -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +static void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) { + bool eoi = true; + /* Check that frontend is requesting an operation and that we are not * already processing a request */ - if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) - && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { - queue_work(xen_pcibk_wq, &pdev->op_work); + if (xen_pcibk_test_op_pending(pdev)) { + schedule_work(&pdev->op_work); + eoi = false; } /*_XEN_PCIB_active should have been cleared by pcifront. And also make sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) && test_bit(_PCIB_op_pending, &pdev->flags)) { wake_up(&xen_pcibk_aer_wait_queue); + eoi = false; } + + /* EOI if there was nothing to do. */ + if (eoi) + xen_pcibk_lateeoi(pdev, XEN_EOI_FLAG_SPURIOUS); } /* Performing the configuration space reads/writes must not be done in atomic @@ -292,15 +318,18 @@ void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) * use of semaphores). This function is intended to be called from a work * queue in process context taking a struct xen_pcibk_device as a parameter */ -void xen_pcibk_do_op(struct work_struct *data) +static void xen_pcibk_do_one_op(struct xen_pcibk_device *pdev) { - struct xen_pcibk_device *pdev = - container_of(data, struct xen_pcibk_device, op_work); struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data = NULL; - struct xen_pci_op *op = &pdev->sh_info->op; + struct xen_pci_op *op = &pdev->op; int test_intx = 0; +#ifdef CONFIG_PCI_MSI + unsigned int nr = 0; +#endif + *op = pdev->sh_info->op; + barrier(); dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); if (dev == NULL) @@ -326,6 +355,7 @@ void xen_pcibk_do_op(struct work_struct *data) op->err = xen_pcibk_disable_msi(pdev, dev, op); break; case XEN_PCI_OP_enable_msix: + nr = op->value; op->err = xen_pcibk_enable_msix(pdev, dev, op); break; case XEN_PCI_OP_disable_msix: @@ -342,25 +372,51 @@ void xen_pcibk_do_op(struct work_struct *data) if ((dev_data->enable_intx != test_intx)) xen_pcibk_control_isr(dev, 0 /* no reset */); } + pdev->sh_info->op.err = op->err; + pdev->sh_info->op.value = op->value; +#ifdef CONFIG_PCI_MSI + if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) { + unsigned int i; + + for (i = 0; i < nr; i++) + pdev->sh_info->op.msix_entries[i].vector = + op->msix_entries[i].vector; + } +#endif /* Tell the driver domain that we're done. */ wmb(); clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ +} - /* Check to see if the driver domain tried to start another request in - * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. 
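/* Editor's note (sketch, not from the patch): two related hardening changes
 * are visible above.  First, the request is copied out of the shared page
 * into pdev->op (with barrier()) before it is acted on, and only err, value
 * and the MSI-X vectors are copied back, so the frontend cannot change a
 * request while it is being processed.  Second, the event channel moves to
 * the deferred "lateeoi" model: the EOI that re-arms delivery is only sent
 * once work has been queued or nothing was found to do
 * (XEN_EOI_FLAG_SPURIOUS).  A minimal sketch of that EOI pattern:
 */
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <xen/events.h>

struct example_backend {
	unsigned long flags;		/* bit 0: an EOI is owed */
	unsigned int evtchn_irq;
};

static irqreturn_t example_handle_event(int irq, void *dev_id)
{
	struct example_backend *be = dev_id;

	set_bit(0, &be->flags);
	/* ...schedule the work that will process the shared ring... */
	return IRQ_HANDLED;
}

static void example_work_done(struct example_backend *be, bool did_work)
{
	if (test_and_clear_bit(0, &be->flags))
		xen_irq_lateeoi(be->evtchn_irq,
				did_work ? 0 : XEN_EOI_FLAG_SPURIOUS);
}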
- */ - xen_pcibk_test_and_schedule_op(pdev); +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + + do { + xen_pcibk_do_one_op(pdev); + } while (xen_pcibk_test_op_pending(pdev)); + + xen_pcibk_lateeoi(pdev, 0); } irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) { struct xen_pcibk_device *pdev = dev_id; + bool eoi; + + /* IRQs might come in before pdev->evtchn_irq is written. */ + if (unlikely(pdev->evtchn_irq != irq)) + pdev->evtchn_irq = irq; + + eoi = test_and_set_bit(_EOI_pending, &pdev->flags); + WARN(eoi, "IRQ while EOI pending\n"); xen_pcibk_test_and_schedule_op(pdev); @@ -375,7 +431,7 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) dev_data->handled++; if ((dev_data->handled % 1000) == 0) { if (xen_test_irq_shared(irq)) { - pr_info("%s IRQ line is not shared " + dev_info(&dev->dev, "%s IRQ line is not shared " "with other domains. Turning ISR off\n", dev_data->irq_name); dev_data->ack_intr = 0; diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 3165ce361b00..cc7450f2b2a9 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Provides a Virtual PCI bus (with real devices) * to the frontend @@ -6,6 +7,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/list.h> #include <linux/slab.h> @@ -68,7 +70,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev, int devid, publish_pci_dev_cb publish_cb) { - int err = 0, slot, func = -1; + int err = 0, slot, func = PCI_FUNC(dev->devfn); struct pci_dev_entry *t, *dev_entry; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; @@ -93,23 +95,25 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* * Keep multi-function devices together on the virtual PCI bus, except - * virtual functions. + * that we want to keep virtual functions at func 0 on their own. They + * aren't multi-function devices and hence their presence at func 0 + * may cause guests to not scan the other functions. */ - if (!dev->is_virtfn) { + if (!dev->is_virtfn || func) { for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) continue; t = list_entry(list_first(&vpci_dev->dev_list[slot]), struct pci_dev_entry, list); + if (t->dev->is_virtfn && !PCI_FUNC(t->dev->devfn)) + continue; if (match_slot(dev, t->dev)) { - pr_info("vpci: %s: assign to virtual slot %d func %d\n", - pci_name(dev), slot, - PCI_FUNC(dev->devfn)); + dev_info(&dev->dev, "vpci: assign to virtual slot %d func %d\n", + slot, func); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = PCI_FUNC(dev->devfn); goto unlock; } } @@ -118,11 +122,10 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* Assign to a new slot on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) { - pr_info("vpci: %s: assign to virtual slot %d\n", - pci_name(dev), slot); + dev_info(&dev->dev, "vpci: assign to virtual slot %d\n", + slot); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); goto unlock; } } @@ -137,13 +140,15 @@ unlock: /* Publish this device. 
*/ if (!err) err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); out: return err; } static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev) + struct pci_dev *dev, bool lock) { int slot; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; @@ -167,8 +172,13 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, out: mutex_unlock(&vpci_dev->lock); - if (found_dev) + if (found_dev) { + if (lock) + device_lock(&found_dev->dev); pcistub_put_pci_dev(found_dev); + if (lock) + device_unlock(&found_dev->dev); + } } static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) @@ -206,8 +216,11 @@ static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) struct pci_dev_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], list) { + struct pci_dev *dev = e->dev; list_del(&e->list); - pcistub_put_pci_dev(e->dev); + device_lock(&dev->dev); + pcistub_put_pci_dev(dev); + device_unlock(&dev->dev); kfree(e); } } @@ -222,7 +235,6 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, unsigned int *devfn) { struct pci_dev_entry *entry; - struct pci_dev *dev = NULL; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; int found = 0, slot; @@ -231,11 +243,7 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, list_for_each_entry(entry, &vpci_dev->dev_list[slot], list) { - dev = entry->dev; - if (dev && dev->bus->number == pcidev->bus->number - && pci_domain_nr(dev->bus) == - pci_domain_nr(pcidev->bus) - && dev->devfn == pcidev->devfn) { + if (entry->dev == pcidev) { found = 1; *domain = 0; *bus = 0; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index a9ed867afaba..b11e401f1b1e 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend Xenbus Setup - handles setup with frontend and xend * @@ -6,18 +7,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/init.h> #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <xen/xenbus.h> #include <xen/events.h> -#include <asm/xen/pci.h> +#include <xen/pci.h> #include "pciback.h" #define INVALID_EVTCHN_IRQ (-1) -struct workqueue_struct *xen_pcibk_wq; static bool __read_mostly passthrough; module_param(passthrough, bool, S_IRUGO); @@ -31,7 +31,7 @@ MODULE_PARM_DESC(passthrough, " frontend (for example, a device at 06:01.b will still appear at\n"\ " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ " exposed PCI devices to its driver domains. 
This may be required\n"\ - " for drivers which depend on finding their hardward in certain\n"\ + " for drivers which depend on finding their hardware in certain\n"\ " bus/slot locations."); static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) @@ -44,7 +44,6 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); pdev->xdev = xdev; - dev_set_drvdata(&xdev->dev, pdev); mutex_init(&pdev->dev_lock); @@ -58,6 +57,9 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) kfree(pdev); pdev = NULL; } + + dev_set_drvdata(&xdev->dev, pdev); + out: return pdev; } @@ -74,8 +76,7 @@ static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) /* If the driver domain started an op, make sure we complete it * before releasing the shared memory */ - /* Note, the workqueue does not use spinlocks at all.*/ - flush_workqueue(xen_pcibk_wq); + flush_work(&pdev->op_work); if (pdev->sh_info != NULL) { xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); @@ -93,6 +94,8 @@ static void free_pdev(struct xen_pcibk_device *pdev) xen_pcibk_disconnect(pdev); + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. */ xen_pcibk_release_devices(pdev); dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -102,16 +105,16 @@ static void free_pdev(struct xen_pcibk_device *pdev) } static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, - int remote_evtchn) + evtchn_port_t remote_evtchn) { int err = 0; void *vaddr; dev_dbg(&pdev->xdev->dev, - "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", + "Attaching to frontend resources - gnt_ref=%d evtchn=%u\n", gnt_ref, remote_evtchn); - err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr); + err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr); if (err < 0) { xenbus_dev_fatal(pdev->xdev, err, "Error mapping other domain page in ours."); @@ -120,8 +123,8 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, pdev->sh_info = vaddr; - err = bind_interdomain_evtchn_to_irqhandler( - pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( + pdev->xdev, remote_evtchn, xen_pcibk_handle_event, 0, DRV_NAME, pdev); if (err < 0) { xenbus_dev_fatal(pdev->xdev, err, @@ -139,7 +142,8 @@ out: static int xen_pcibk_attach(struct xen_pcibk_device *pdev) { int err = 0; - int gnt_ref, remote_evtchn; + int gnt_ref; + evtchn_port_t remote_evtchn; char *magic = NULL; @@ -172,6 +176,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev) "version mismatch (%s/%s) with pcifront - " "halting " DRV_NAME, magic, XEN_PCI_MAGIC); + err = -EFAULT; goto out; } @@ -244,7 +249,7 @@ static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, if (err) goto out; - dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); + dev_info(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); if (xen_register_device_domain_owner(dev, pdev->xdev->otherend_id) != 0) { dev_err(&dev->dev, "Stealing ownership from dom%d.\n", @@ -286,7 +291,9 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); xen_unregister_device_domain_owner(dev); - xen_pcibk_release_pci_dev(pdev, dev); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ + xen_pcibk_release_pci_dev(pdev, dev, true /* use the lock. 
*/); out: return err; @@ -352,12 +359,13 @@ out: return err; } -static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev, + enum xenbus_state state) { int err = 0; int num_devs; int domain, bus, slot, func; - int substate; + unsigned int substate; int i, len; char state_str[64]; char dev_str[64]; @@ -366,9 +374,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); mutex_lock(&pdev->dev_lock); - /* Make sure we only reconfigure once */ - if (xenbus_read_driver_state(pdev->xdev->nodename) != - XenbusStateReconfiguring) + if (xenbus_read_driver_state(pdev->xdev->nodename) != state) goto out; err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", @@ -390,10 +396,8 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) "configuration"); goto out; } - err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, - "%d", &substate); - if (err != 1) - substate = XenbusStateUnknown; + substate = xenbus_read_unsigned(pdev->xdev->nodename, state_str, + XenbusStateUnknown); switch (substate) { case XenbusStateInitialising: @@ -495,6 +499,10 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) } } + if (state != XenbusStateReconfiguring) + /* Make sure we only reconfigure once. */ + goto out; + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); if (err) { xenbus_dev_fatal(pdev->xdev, err, @@ -520,7 +528,7 @@ static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, break; case XenbusStateReconfiguring: - xen_pcibk_reconfigure(pdev); + xen_pcibk_reconfigure(pdev, XenbusStateReconfiguring); break; case XenbusStateConnected: @@ -540,7 +548,7 @@ static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, xenbus_switch_state(xdev, XenbusStateClosed); if (xenbus_dev_is_online(xdev)) break; - /* fall through if not online */ + fallthrough; /* if not online */ case XenbusStateUnknown: dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); device_unregister(&xdev->dev); @@ -649,7 +657,7 @@ out: } static void xen_pcibk_be_watch(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { struct xen_pcibk_device *pdev = container_of(watch, struct xen_pcibk_device, be_watch); @@ -659,6 +667,15 @@ static void xen_pcibk_be_watch(struct xenbus_watch *watch, xen_pcibk_setup_backend(pdev); break; + case XenbusStateInitialised: + /* + * We typically move to Initialised when the first device was + * added. Hence subsequent devices getting added may need + * reconfiguring. + */ + xen_pcibk_reconfigure(pdev, XenbusStateInitialised); + break; + default: break; } @@ -684,7 +701,7 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, /* watch the backend node for backend configuration information */ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, - xen_pcibk_be_watch); + NULL, xen_pcibk_be_watch); if (err) goto out; @@ -693,20 +710,18 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, /* We need to force a call to our callback here in case * xend already configured us! 
*/ - xen_pcibk_be_watch(&pdev->be_watch, NULL, 0); + xen_pcibk_be_watch(&pdev->be_watch, NULL, NULL); out: return err; } -static int xen_pcibk_xenbus_remove(struct xenbus_device *dev) +static void xen_pcibk_xenbus_remove(struct xenbus_device *dev) { struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev); if (pdev != NULL) free_pdev(pdev); - - return 0; } static const struct xenbus_device_id xen_pcibk_ids[] = { @@ -714,21 +729,21 @@ static const struct xenbus_device_id xen_pcibk_ids[] = { {""}, }; -static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME, +static struct xenbus_driver xen_pcibk_driver = { + .name = DRV_NAME, + .ids = xen_pcibk_ids, .probe = xen_pcibk_xenbus_probe, .remove = xen_pcibk_xenbus_remove, .otherend_changed = xen_pcibk_frontend_changed, -); +}; const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; int __init xen_pcibk_xenbus_register(void) { - xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); - if (!xen_pcibk_wq) { - pr_err("%s: create xen_pciback_workqueue failed\n", __func__); - return -EFAULT; - } + if (!xen_pcibk_pv_support()) + return 0; + xen_pcibk_backend = &xen_pcibk_vpci_backend; if (passthrough) xen_pcibk_backend = &xen_pcibk_passthrough_backend; @@ -738,6 +753,6 @@ int __init xen_pcibk_xenbus_register(void) void __exit xen_pcibk_xenbus_unregister(void) { - destroy_workqueue(xen_pcibk_wq); - xenbus_unregister_driver(&xen_pcibk_driver); + if (xen_pcibk_pv_support()) + xenbus_unregister_driver(&xen_pcibk_driver); } diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c new file mode 100644 index 000000000000..0c51edfd13dc --- /dev/null +++ b/drivers/xen/xen-scsiback.c @@ -0,0 +1,1891 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * Adaption to kernel taget core infrastructure taken from vhost/scsi.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen-pvscsi: " fmt + +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/gfp.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/configfs.h> + +#include <generated/utsrelease.h> + +#include <scsi/scsi_host.h> /* SG_ALL */ + +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> + +#include <asm/hypervisor.h> + +#include <xen/xen.h> +#include <xen/balloon.h> +#include <xen/events.h> +#include <xen/xenbus.h> +#include <xen/grant_table.h> +#include <xen/page.h> + +#include <xen/interface/grant_table.h> +#include <xen/interface/io/vscsiif.h> + +#define VSCSI_VERSION "v0.1" +#define VSCSI_NAMELEN 32 + +struct ids_tuple { + unsigned int hst; /* host */ + unsigned int chn; /* channel */ + unsigned int tgt; /* target */ + unsigned int lun; /* LUN */ +}; + +struct v2p_entry { + struct ids_tuple v; /* translate from */ + struct scsiback_tpg *tpg; /* translate to */ + unsigned int lun; + struct kref kref; + struct list_head l; +}; + +struct vscsibk_info { + struct xenbus_device *dev; + + domid_t domid; + unsigned int irq; + + struct vscsiif_back_ring ring; + + spinlock_t ring_lock; + atomic_t nr_unreplied_reqs; + + spinlock_t v2p_lock; + struct list_head v2p_entry_lists; + + wait_queue_head_t waiting_to_free; + + struct gnttab_page_cache free_pages; +}; + +/* theoretical maximum of grants for one request */ +#define VSCSI_MAX_GRANTS (SG_ALL + VSCSIIF_SG_TABLESIZE) + +/* + * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one + * call to map/unmap grants. Don't choose it too large, as there are arrays + * with VSCSI_GRANT_BATCH elements allocated on the stack. 
+ */ +#define VSCSI_GRANT_BATCH 16 + +struct vscsibk_pend { + uint16_t rqid; + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint8_t cmd_len; + + uint8_t sc_data_direction; + uint16_t n_sg; /* real length of SG list */ + uint16_t n_grants; /* SG pages and potentially SG list */ + uint32_t data_len; + uint32_t result; + + struct vscsibk_info *info; + struct v2p_entry *v2p; + struct scatterlist *sgl; + + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + + grant_handle_t grant_handles[VSCSI_MAX_GRANTS]; + struct page *pages[VSCSI_MAX_GRANTS]; + + struct se_cmd se_cmd; + + struct completion tmr_done; +}; + +#define VSCSI_DEFAULT_SESSION_TAGS 128 + +struct scsiback_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct scsiback_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for pvscsi Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for pvscsi Target port */ + char tport_name[VSCSI_NAMELEN]; + /* Returned by scsiback_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct scsiback_tpg { + /* scsiback port target portal group tag for TCM */ + u16 tport_tpgt; + /* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */ + int tv_tpg_port_count; + /* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_fe_count; + /* list for scsiback_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */ + struct scsiback_nexus *tpg_nexus; + /* Pointer back to scsiback_tport */ + struct scsiback_tport *tport; + /* Returned by scsiback_make_tpg() */ + struct se_portal_group se_tpg; + /* alias used in xenstore */ + char param_alias[VSCSI_NAMELEN]; + /* list of info structures related to this target portal group */ + struct list_head info_list; +}; + +#define SCSIBACK_INVALID_HANDLE (~0) + +static bool log_print_stat; +module_param(log_print_stat, bool, 0644); + +static int scsiback_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in backend buffer"); + +/* Global spinlock to protect scsiback TPG list */ +static DEFINE_MUTEX(scsiback_mutex); +static LIST_HEAD(scsiback_list); + +static void scsiback_get(struct vscsibk_info *info) +{ + atomic_inc(&info->nr_unreplied_reqs); +} + +static void scsiback_put(struct vscsibk_info *info) +{ + if (atomic_dec_and_test(&info->nr_unreplied_reqs)) + wake_up(&info->waiting_to_free); +} + +static unsigned long vaddr_page(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned long vaddr(struct vscsibk_pend *req, int seg) +{ + return vaddr_page(req->pages[seg]); +} + +static void scsiback_print_status(char *sense_buffer, int errors, + struct vscsibk_pend *pending_req) +{ + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + + pr_err("[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x\n", + tpg->tport->tport_name, pending_req->v2p->lun, + pending_req->cmnd[0], errors & 0xff, COMMAND_COMPLETE, + host_byte(errors)); +} + +static void scsiback_fast_flush_area(struct vscsibk_pend *req) +{ + struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH]; + struct page *pages[VSCSI_GRANT_BATCH]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int err; + + kfree(req->sgl); + req->sgl = NULL; + req->n_sg = 0; + 
+ if (!req->n_grants) + return; + + for (i = 0; i < req->n_grants; i++) { + handle = req->grant_handles[i]; + if (handle == SCSIBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + req->grant_handles[i] = SCSIBACK_INVALID_HANDLE; + pages[invcount] = req->pages[i]; + put_page(pages[invcount]); + invcount++; + if (invcount < VSCSI_GRANT_BATCH) + continue; + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + invcount = 0; + } + + if (invcount) { + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + } + + gnttab_page_cache_put(&req->info->free_pages, req->pages, + req->n_grants); + req->n_grants = 0; +} + +static void scsiback_free_translation_entry(struct kref *kref) +{ + struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref); + struct scsiback_tpg *tpg = entry->tpg; + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(entry); +} + +static int32_t scsiback_result(int32_t result) +{ + int32_t host_status; + + switch (XEN_VSCSIIF_RSLT_HOST(result)) { + case DID_OK: + host_status = XEN_VSCSIIF_RSLT_HOST_OK; + break; + case DID_NO_CONNECT: + host_status = XEN_VSCSIIF_RSLT_HOST_NO_CONNECT; + break; + case DID_BUS_BUSY: + host_status = XEN_VSCSIIF_RSLT_HOST_BUS_BUSY; + break; + case DID_TIME_OUT: + host_status = XEN_VSCSIIF_RSLT_HOST_TIME_OUT; + break; + case DID_BAD_TARGET: + host_status = XEN_VSCSIIF_RSLT_HOST_BAD_TARGET; + break; + case DID_ABORT: + host_status = XEN_VSCSIIF_RSLT_HOST_ABORT; + break; + case DID_PARITY: + host_status = XEN_VSCSIIF_RSLT_HOST_PARITY; + break; + case DID_ERROR: + host_status = XEN_VSCSIIF_RSLT_HOST_ERROR; + break; + case DID_RESET: + host_status = XEN_VSCSIIF_RSLT_HOST_RESET; + break; + case DID_BAD_INTR: + host_status = XEN_VSCSIIF_RSLT_HOST_BAD_INTR; + break; + case DID_PASSTHROUGH: + host_status = XEN_VSCSIIF_RSLT_HOST_PASSTHROUGH; + break; + case DID_SOFT_ERROR: + host_status = XEN_VSCSIIF_RSLT_HOST_SOFT_ERROR; + break; + case DID_IMM_RETRY: + host_status = XEN_VSCSIIF_RSLT_HOST_IMM_RETRY; + break; + case DID_REQUEUE: + host_status = XEN_VSCSIIF_RSLT_HOST_REQUEUE; + break; + case DID_TRANSPORT_DISRUPTED: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_DISRUPTED; + break; + case DID_TRANSPORT_FAILFAST: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_FAILFAST; + break; + case DID_TRANSPORT_MARGINAL: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_MARGINAL; + break; + default: + host_status = XEN_VSCSIIF_RSLT_HOST_ERROR; + break; + } + + return (host_status << 16) | (result & 0x00ffff); +} + +static void scsiback_send_response(struct vscsibk_info *info, + char *sense_buffer, int32_t result, uint32_t resid, + uint16_t rqid) +{ + struct vscsiif_response *ring_res; + int notify; + struct scsi_sense_hdr sshdr; + unsigned long flags; + unsigned len; + + spin_lock_irqsave(&info->ring_lock, flags); + + ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt); + info->ring.rsp_prod_pvt++; + + ring_res->rslt = scsiback_result(result); + ring_res->rqid = rqid; + + if (sense_buffer != NULL && + scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE, + &sshdr)) { + len = min_t(unsigned, 8 + sense_buffer[7], + VSCSIIF_SENSE_BUFFERSIZE); + memcpy(ring_res->sense_buffer, sense_buffer, len); + ring_res->sense_len = len; + } else { + ring_res->sense_len = 0; + } + + ring_res->residual_len = resid; + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify); + 
spin_unlock_irqrestore(&info->ring_lock, flags); + + if (notify) + notify_remote_via_irq(info->irq); +} + +static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result, + uint32_t resid, struct vscsibk_pend *pending_req) +{ + scsiback_send_response(pending_req->info, sense_buffer, result, + resid, pending_req->rqid); + + if (pending_req->v2p) + kref_put(&pending_req->v2p->kref, + scsiback_free_translation_entry); +} + +static void scsiback_cmd_done(struct vscsibk_pend *pending_req) +{ + struct vscsibk_info *info = pending_req->info; + unsigned char *sense_buffer; + unsigned int resid; + int errors; + + sense_buffer = pending_req->sense_buffer; + resid = pending_req->se_cmd.residual_count; + errors = pending_req->result; + + if (errors && log_print_stat) + scsiback_print_status(sense_buffer, errors, pending_req); + + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req); + scsiback_put(info); + /* + * Drop the extra KREF_ACK reference taken by target_submit_cmd_map_sgls() + * ahead of scsiback_check_stop_free() -> transport_generic_free_cmd() + * final se_cmd->cmd_kref put. + */ + target_put_sess_cmd(&pending_req->se_cmd); +} + +static void scsiback_cmd_exec(struct vscsibk_pend *pending_req) +{ + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess; + + scsiback_get(pending_req->info); + se_cmd->tag = pending_req->rqid; + target_init_cmd(se_cmd, sess, pending_req->sense_buffer, + pending_req->v2p->lun, pending_req->data_len, 0, + pending_req->sc_data_direction, TARGET_SCF_ACK_KREF); + + if (target_submit_prep(se_cmd, pending_req->cmnd, pending_req->sgl, + pending_req->n_sg, NULL, 0, NULL, 0, GFP_KERNEL)) + return; + + target_submit(se_cmd); +} + +static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, + struct page **pg, grant_handle_t *grant, int cnt) +{ + int err, i; + + if (!cnt) + return 0; + + err = gnttab_map_refs(map, NULL, pg, cnt); + for (i = 0; i < cnt; i++) { + if (unlikely(map[i].status != GNTST_okay)) { + pr_err("invalid buffer -- could not remap it\n"); + map[i].handle = SCSIBACK_INVALID_HANDLE; + if (!err) + err = -ENOMEM; + } else { + get_page(pg[i]); + } + grant[i] = map[i].handle; + } + return err; +} + +static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req, + struct scsiif_request_segment *seg, struct page **pg, + grant_handle_t *grant, int cnt, u32 flags) +{ + int mapcount = 0, i, err = 0; + struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH]; + struct vscsibk_info *info = pending_req->info; + + for (i = 0; i < cnt; i++) { + if (gnttab_page_cache_get(&info->free_pages, pg + mapcount)) { + gnttab_page_cache_put(&info->free_pages, pg, mapcount); + pr_err("no grant page\n"); + return -ENOMEM; + } + gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]), + flags, seg[i].gref, info->domid); + mapcount++; + if (mapcount < VSCSI_GRANT_BATCH) + continue; + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pg += mapcount; + grant += mapcount; + pending_req->n_grants += mapcount; + if (err) + return err; + mapcount = 0; + } + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pending_req->n_grants += mapcount; + return err; +} + +static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + u32 flags; + int i, err, n_segs, i_seg = 0; + struct page **pg; + struct scsiif_request_segment *seg; + unsigned long end_seg = 0; + 
unsigned int nr_segments = (unsigned int)ring_req->nr_segments; + unsigned int nr_sgl = 0; + struct scatterlist *sg; + grant_handle_t *grant; + + pending_req->n_sg = 0; + pending_req->n_grants = 0; + pending_req->data_len = 0; + + nr_segments &= ~VSCSIIF_SG_GRANT; + if (!nr_segments) + return 0; + + if (nr_segments > VSCSIIF_SG_TABLESIZE) { + pr_debug("invalid parameter nr_seg = %d\n", + ring_req->nr_segments); + return -EINVAL; + } + + if (ring_req->nr_segments & VSCSIIF_SG_GRANT) { + err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg, + pending_req->pages, pending_req->grant_handles, + nr_segments, GNTMAP_host_map | GNTMAP_readonly); + if (err) + return err; + nr_sgl = nr_segments; + nr_segments = 0; + for (i = 0; i < nr_sgl; i++) { + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + if ((unsigned)ring_req->seg[i].offset + + (unsigned)ring_req->seg[i].length > PAGE_SIZE || + n_segs * sizeof(struct scsiif_request_segment) != + ring_req->seg[i].length) + return -EINVAL; + nr_segments += n_segs; + } + if (nr_segments > SG_ALL) { + pr_debug("invalid nr_seg = %d\n", nr_segments); + return -EINVAL; + } + } + + /* free of (sgl) in fast_flush_area() */ + pending_req->sgl = kmalloc_array(nr_segments, + sizeof(struct scatterlist), GFP_KERNEL); + if (!pending_req->sgl) + return -ENOMEM; + + sg_init_table(pending_req->sgl, nr_segments); + pending_req->n_sg = nr_segments; + + flags = GNTMAP_host_map; + if (pending_req->sc_data_direction == DMA_TO_DEVICE) + flags |= GNTMAP_readonly; + + pg = pending_req->pages + nr_sgl; + grant = pending_req->grant_handles + nr_sgl; + if (!nr_sgl) { + seg = ring_req->seg; + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, nr_segments, flags); + if (err) + return err; + } else { + for (i = 0; i < nr_sgl; i++) { + seg = (struct scsiif_request_segment *)( + vaddr(pending_req, i) + ring_req->seg[i].offset); + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, n_segs, flags); + if (err) + return err; + pg += n_segs; + grant += n_segs; + } + end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[0].length; + pg = pending_req->pages + nr_sgl; + } + + for_each_sg(pending_req->sgl, sg, nr_segments, i) { + sg_set_page(sg, pg[i], seg->length, seg->offset); + pending_req->data_len += seg->length; + seg++; + if (nr_sgl && (unsigned long)seg >= end_seg) { + i_seg++; + end_seg = vaddr(pending_req, i_seg) + + ring_req->seg[i_seg].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[i_seg].length; + } + if (sg->offset >= PAGE_SIZE || + sg->length > PAGE_SIZE || + sg->offset + sg->length > PAGE_SIZE) + return -EINVAL; + } + + return 0; +} + +static void scsiback_disconnect(struct vscsibk_info *info) +{ + wait_event(info->waiting_to_free, + atomic_read(&info->nr_unreplied_reqs) == 0); + + unbind_from_irqhandler(info->irq, info); + info->irq = 0; + xenbus_unmap_ring_vfree(info->dev, info->ring.sring); +} + +static void scsiback_device_action(struct vscsibk_pend *pending_req, + enum tcm_tmreq_table act, int tag) +{ + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + struct scsiback_nexus *nexus = tpg->tpg_nexus; + struct se_cmd *se_cmd = &pending_req->se_cmd; + u64 unpacked_lun = pending_req->v2p->lun; + int rc, err = XEN_VSCSIIF_RSLT_RESET_FAILED; + + init_completion(&pending_req->tmr_done); + + rc = 
target_submit_tmr(&pending_req->se_cmd, nexus->tvn_se_sess, + &pending_req->sense_buffer[0], + unpacked_lun, NULL, act, GFP_KERNEL, + tag, TARGET_SCF_ACK_KREF); + if (rc) + goto err; + + wait_for_completion(&pending_req->tmr_done); + + err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ? + XEN_VSCSIIF_RSLT_RESET_SUCCESS : XEN_VSCSIIF_RSLT_RESET_FAILED; + + scsiback_do_resp_with_sense(NULL, err, 0, pending_req); + transport_generic_free_cmd(&pending_req->se_cmd, 0); + return; + +err: + scsiback_do_resp_with_sense(NULL, err, 0, pending_req); +} + +/* + Perform virtual to physical translation +*/ +static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + kref_get(&entry->kref); + goto out; + } + } + entry = NULL; + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return entry; +} + +static struct vscsibk_pend *scsiback_get_pend_req(struct vscsiif_back_ring *ring, + struct v2p_entry *v2p) +{ + struct scsiback_tpg *tpg = v2p->tpg; + struct scsiback_nexus *nexus = tpg->tpg_nexus; + struct se_session *se_sess = nexus->tvn_se_sess; + struct vscsibk_pend *req; + int tag, cpu, i; + + tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu); + if (tag < 0) { + pr_err("Unable to obtain tag for vscsiif_request\n"); + return ERR_PTR(-ENOMEM); + } + + req = &((struct vscsibk_pend *)se_sess->sess_cmd_map)[tag]; + memset(req, 0, sizeof(*req)); + req->se_cmd.map_tag = tag; + req->se_cmd.map_cpu = cpu; + + for (i = 0; i < VSCSI_MAX_GRANTS; i++) + req->grant_handles[i] = SCSIBACK_INVALID_HANDLE; + + return req; +} + +static struct vscsibk_pend *prepare_pending_reqs(struct vscsibk_info *info, + struct vscsiif_back_ring *ring, + struct vscsiif_request *ring_req) +{ + struct vscsibk_pend *pending_req; + struct v2p_entry *v2p; + struct ids_tuple vir; + + /* request range check from frontend */ + if ((ring_req->sc_data_direction != DMA_BIDIRECTIONAL) && + (ring_req->sc_data_direction != DMA_TO_DEVICE) && + (ring_req->sc_data_direction != DMA_FROM_DEVICE) && + (ring_req->sc_data_direction != DMA_NONE)) { + pr_debug("invalid parameter data_dir = %d\n", + ring_req->sc_data_direction); + return ERR_PTR(-EINVAL); + } + if (ring_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) { + pr_debug("invalid parameter cmd_len = %d\n", + ring_req->cmd_len); + return ERR_PTR(-EINVAL); + } + + vir.chn = ring_req->channel; + vir.tgt = ring_req->id; + vir.lun = ring_req->lun; + + v2p = scsiback_do_translation(info, &vir); + if (!v2p) { + pr_debug("the v2p of (chn:%d, tgt:%d, lun:%d) doesn't exist.\n", + vir.chn, vir.tgt, vir.lun); + return ERR_PTR(-ENODEV); + } + + pending_req = scsiback_get_pend_req(ring, v2p); + if (IS_ERR(pending_req)) { + kref_put(&v2p->kref, scsiback_free_translation_entry); + return ERR_PTR(-ENOMEM); + } + pending_req->rqid = ring_req->rqid; + pending_req->info = info; + pending_req->v2p = v2p; + pending_req->sc_data_direction = ring_req->sc_data_direction; + pending_req->cmd_len = ring_req->cmd_len; + memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len); + + return pending_req; +} + +static int scsiback_do_cmd_fn(struct vscsibk_info *info, + unsigned int *eoi_flags) +{ + struct vscsiif_back_ring *ring = &info->ring; + struct vscsiif_request ring_req; + struct 
vscsibk_pend *pending_req; + RING_IDX rc, rp; + int more_to_do; + uint32_t result; + + rc = ring->req_cons; + rp = ring->sring->req_prod; + rmb(); /* guest system is accessing ring, too */ + + if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) { + rc = ring->rsp_prod_pvt; + pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n", + info->domid, rp, rc, rp - rc); + return -EINVAL; + } + + while ((rc != rp)) { + *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS; + + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) + break; + + RING_COPY_REQUEST(ring, rc, &ring_req); + ring->req_cons = ++rc; + + pending_req = prepare_pending_reqs(info, ring, &ring_req); + if (IS_ERR(pending_req)) { + switch (PTR_ERR(pending_req)) { + case -ENODEV: + result = DID_NO_CONNECT; + break; + default: + result = DID_ERROR; + break; + } + scsiback_send_response(info, NULL, result << 16, 0, + ring_req.rqid); + return 1; + } + + switch (ring_req.act) { + case VSCSIIF_ACT_SCSI_CDB: + if (scsiback_gnttab_data_map(&ring_req, pending_req)) { + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(NULL, + DID_ERROR << 16, 0, pending_req); + transport_generic_free_cmd(&pending_req->se_cmd, 0); + } else { + scsiback_cmd_exec(pending_req); + } + break; + case VSCSIIF_ACT_SCSI_ABORT: + scsiback_device_action(pending_req, TMR_ABORT_TASK, + ring_req.ref_rqid); + break; + case VSCSIIF_ACT_SCSI_RESET: + scsiback_device_action(pending_req, TMR_LUN_RESET, 0); + break; + default: + pr_err_ratelimited("invalid request\n"); + scsiback_do_resp_with_sense(NULL, DID_ERROR << 16, 0, + pending_req); + transport_generic_free_cmd(&pending_req->se_cmd, 0); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + gnttab_page_cache_shrink(&info->free_pages, scsiback_max_buffer_pages); + + RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do); + return more_to_do; +} + +static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) +{ + struct vscsibk_info *info = dev_id; + int rc; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; + + while ((rc = scsiback_do_cmd_fn(info, &eoi_flags)) > 0) + cond_resched(); + + /* In case of a ring error we keep the event channel masked. 
*/ + if (!rc) + xen_irq_lateeoi(irq, eoi_flags); + + return IRQ_HANDLED; +} + +static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, + evtchn_port_t evtchn) +{ + void *area; + struct vscsiif_sring *sring; + int err; + + if (info->irq) + return -1; + + err = xenbus_map_ring_valloc(info->dev, &ring_ref, 1, &area); + if (err) + return err; + + sring = (struct vscsiif_sring *)area; + BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = bind_interdomain_evtchn_to_irq_lateeoi(info->dev, evtchn); + if (err < 0) + goto unmap_page; + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn, + IRQF_ONESHOT, "vscsiif-backend", info); + if (err) + goto free_irq; + + return 0; + +free_irq: + unbind_from_irqhandler(info->irq, info); + info->irq = 0; +unmap_page: + xenbus_unmap_ring_vfree(info->dev, area); + + return err; +} + +static int scsiback_map(struct vscsibk_info *info) +{ + struct xenbus_device *dev = info->dev; + unsigned int ring_ref; + evtchn_port_t evtchn; + int err; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%u", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend); + return err; + } + + return scsiback_init_sring(info, ring_ref, evtchn); +} + +/* + Check for a translation entry being present +*/ +static struct v2p_entry *scsiback_chk_translation_entry( + struct vscsibk_info *info, struct ids_tuple *v) +{ + struct list_head *head = &(info->v2p_entry_lists); + struct v2p_entry *entry; + + list_for_each_entry(entry, head, l) + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) + return entry; + + return NULL; +} + +/* + Add a new translation entry +*/ +static int scsiback_add_translation_entry(struct vscsibk_info *info, + char *phy, struct ids_tuple *v) +{ + int err = 0; + struct v2p_entry *new; + unsigned long flags; + char *lunp; + unsigned long long unpacked_lun; + struct se_lun *se_lun; + struct scsiback_tpg *tpg_entry, *tpg = NULL; + char *error = "doesn't exist"; + + lunp = strrchr(phy, ':'); + if (!lunp) { + pr_err("illegal format of physical device %s\n", phy); + return -EINVAL; + } + *lunp = 0; + lunp++; + err = kstrtoull(lunp, 10, &unpacked_lun); + if (err < 0) { + pr_err("lun number not valid: %s\n", lunp); + return err; + } + + mutex_lock(&scsiback_mutex); + list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) { + if (!strcmp(phy, tpg_entry->tport->tport_name) || + !strcmp(phy, tpg_entry->param_alias)) { + mutex_lock(&tpg_entry->se_tpg.tpg_lun_mutex); + hlist_for_each_entry(se_lun, &tpg_entry->se_tpg.tpg_lun_hlist, link) { + if (se_lun->unpacked_lun == unpacked_lun) { + if (!tpg_entry->tpg_nexus) + error = "nexus undefined"; + else + tpg = tpg_entry; + break; + } + } + mutex_unlock(&tpg_entry->se_tpg.tpg_lun_mutex); + break; + } + } + if (tpg) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + } + mutex_unlock(&scsiback_mutex); + + if (!tpg) { + pr_err("%s:%llu %s\n", phy, unpacked_lun, error); + return -ENODEV; + } + + new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL); + if (new == NULL) { + err = -ENOMEM; + goto out_free; + } + + spin_lock_irqsave(&info->v2p_lock, flags); + + /* Check double assignment to identical virtual ID */ + if (scsiback_chk_translation_entry(info, v)) { + pr_warn("Virtual ID is already used. 
Assignment was not performed.\n"); + err = -EEXIST; + goto out; + } + + /* Create a new translation entry and add to the list */ + kref_init(&new->kref); + new->v = *v; + new->tpg = tpg; + new->lun = unpacked_lun; + list_add_tail(&new->l, &info->v2p_entry_lists); + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + +out_free: + if (err) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + kfree(new); + } + + return err; +} + +/* + Delete the translation entry specified +*/ +static int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + /* Find out the translation entry specified */ + entry = scsiback_chk_translation_entry(info, v); + if (entry) + list_del(&entry->l); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + + if (!entry) + return -ENOENT; + + kref_put(&entry->kref, scsiback_free_translation_entry); + return 0; +} + +static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, + char *phy, struct ids_tuple *vir, int try) +{ + struct v2p_entry *entry; + unsigned long flags; + int err; + + if (try) { + spin_lock_irqsave(&info->v2p_lock, flags); + entry = scsiback_chk_translation_entry(info, vir); + spin_unlock_irqrestore(&info->v2p_lock, flags); + if (entry) + return; + } + if (!scsiback_add_translation_entry(info, phy, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateInitialised)) { + pr_err("xenbus_printf error %s\n", state); + scsiback_del_translation_entry(info, vir); + } + } else if (!try) { + err = xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); + } +} + +static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state, + struct ids_tuple *vir) +{ + if (!scsiback_del_translation_entry(info, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed)) + pr_err("xenbus_printf error %s\n", state); + } +} + +#define VSCSIBACK_OP_ADD_OR_DEL_LUN 1 +#define VSCSIBACK_OP_UPDATEDEV_STATE 2 + +static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, + char *ent) +{ + int err; + struct ids_tuple vir; + char *val; + int device_state; + char phy[VSCSI_NAMELEN]; + char str[64]; + char state[64]; + struct xenbus_device *dev = info->dev; + + /* read status */ + snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state); + if (XENBUS_EXIST_ERR(err)) + return; + + /* physical SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent); + val = xenbus_read(XBT_NIL, dev->nodename, str, NULL); + if (IS_ERR(val)) { + err = xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); + return; + } + strscpy(phy, val, VSCSI_NAMELEN); + kfree(val); + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u", + &vir.hst, &vir.chn, &vir.tgt, &vir.lun); + if (XENBUS_EXIST_ERR(err)) { + err = xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); + return; + } + + switch (op) { + case VSCSIBACK_OP_ADD_OR_DEL_LUN: + switch 
(device_state) { + case XenbusStateInitialising: + scsiback_do_add_lun(info, state, phy, &vir, 0); + break; + case XenbusStateConnected: + scsiback_do_add_lun(info, state, phy, &vir, 1); + break; + case XenbusStateClosing: + scsiback_do_del_lun(info, state, &vir); + break; + default: + break; + } + break; + + case VSCSIBACK_OP_UPDATEDEV_STATE: + if (device_state == XenbusStateInitialised) { + /* modify vscsi-devs/dev-x/state */ + if (xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateConnected)) { + pr_err("xenbus_printf error %s\n", str); + scsiback_del_translation_entry(info, &vir); + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + } + } + break; + /* When it is necessary, processing is added here. */ + default: + break; + } +} + +static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op) +{ + int i; + char **dir; + unsigned int ndir = 0; + + dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs", + &ndir); + if (IS_ERR(dir)) + return; + + for (i = 0; i < ndir; i++) + scsiback_do_1lun_hotplug(info, op, dir[i]); + + kfree(dir); +} + +static void scsiback_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + switch (frontend_state) { + case XenbusStateInitialising: + break; + + case XenbusStateInitialised: + if (scsiback_map(info)) + break; + + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE); + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + if (info->irq) + scsiback_disconnect(info); + + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + fallthrough; /* if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + case XenbusStateReconfiguring: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfigured); + + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* + Release the translation entry specified +*/ +static void scsiback_release_translation_entry(struct vscsibk_info *info) +{ + struct v2p_entry *entry, *tmp; + struct list_head *head = &(info->v2p_entry_lists); + struct list_head tmp_list; + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + + list_cut_before(&tmp_list, head, head); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + + list_for_each_entry_safe(entry, tmp, &tmp_list, l) { + list_del(&entry->l); + kref_put(&entry->kref, scsiback_free_translation_entry); + } +} + +static void scsiback_remove(struct xenbus_device *dev) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + if (info->irq) + scsiback_disconnect(info); + + scsiback_release_translation_entry(info); + + gnttab_page_cache_shrink(&info->free_pages, 0); + + dev_set_drvdata(&dev->dev, NULL); +} + +static int scsiback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + + struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info), + GFP_KERNEL); + + pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); + + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating backend 
structure"); + return -ENOMEM; + } + info->dev = dev; + dev_set_drvdata(&dev->dev, info); + + info->domid = dev->otherend_id; + spin_lock_init(&info->ring_lock); + atomic_set(&info->nr_unreplied_reqs, 0); + init_waitqueue_head(&info->waiting_to_free); + info->dev = dev; + info->irq = 0; + INIT_LIST_HEAD(&info->v2p_entry_lists); + spin_lock_init(&info->v2p_lock); + gnttab_page_cache_init(&info->free_pages); + + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u", + SG_ALL); + if (err) + xenbus_dev_error(dev, err, "writing feature-sg-grant"); + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + pr_warn("%s failed\n", __func__); + scsiback_remove(dev); + + return err; +} + +static char *scsiback_dump_proto_id(struct scsiback_tport *tport) +{ + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return "SAS"; + case SCSI_PROTOCOL_FCP: + return "FCP"; + case SCSI_PROTOCOL_ISCSI: + return "iSCSI"; + default: + break; + } + + return "Unknown"; +} + +static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + return &tport->tport_name[0]; +} + +static u16 scsiback_get_tag(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + return tpg->tport_tpgt; +} + +static struct se_wwn * +scsiback_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport; + char *ptr; + u64 wwpn = 0; + int off = 0; + + tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL); + if (!tport) + return ERR_PTR(-ENOMEM); + + tport->tport_wwpn = wwpn; + /* + * Determine the emulated Protocol Identifier and Target Port Name + * based on the incoming configfs directory name. + */ + ptr = strstr(name, "naa."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_SAS; + goto check_len; + } + ptr = strstr(name, "fc."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_FCP; + off = 3; /* Skip over "fc." 
*/ + goto check_len; + } + ptr = strstr(name, "iqn."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_ISCSI; + goto check_len; + } + + pr_err("Unable to locate prefix for emulated Target Port: %s\n", name); + kfree(tport); + return ERR_PTR(-EINVAL); + +check_len: + if (strlen(name) >= VSCSI_NAMELEN) { + pr_err("Emulated %s Address: %s, exceeds max: %d\n", name, + scsiback_dump_proto_id(tport), VSCSI_NAMELEN); + kfree(tport); + return ERR_PTR(-EINVAL); + } + snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]); + + pr_debug("Allocated emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), name); + + return &tport->tport_wwn; +} + +static void scsiback_drop_tport(struct se_wwn *wwn) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + pr_debug("Deallocating emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), tport->tport_name); + + kfree(tport); +} + +static int scsiback_check_stop_free(struct se_cmd *se_cmd) +{ + return transport_generic_free_cmd(se_cmd, 0); +} + +static void scsiback_release_cmd(struct se_cmd *se_cmd) +{ + target_free_tag(se_cmd->se_sess, se_cmd); +} + +static int scsiback_write_pending(struct se_cmd *se_cmd) +{ + /* Go ahead and process the write immediately */ + target_execute_cmd(se_cmd); + + return 0; +} + +static int scsiback_queue_data_in(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + pending_req->result = SAM_STAT_GOOD; + scsiback_cmd_done(pending_req); + return 0; +} + +static int scsiback_queue_status(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + if (se_cmd->sense_buffer && + ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE))) + pending_req->result = SAM_STAT_CHECK_CONDITION; + else + pending_req->result = se_cmd->scsi_status; + + scsiback_cmd_done(pending_req); + return 0; +} + +static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + complete(&pending_req->tmr_done); +} + +static void scsiback_aborted_task(struct se_cmd *se_cmd) +{ +} + +static ssize_t scsiback_tpg_param_alias_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = param_to_tpg(item); + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + ssize_t rb; + + mutex_lock(&tpg->tv_tpg_mutex); + rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias); + mutex_unlock(&tpg->tv_tpg_mutex); + + return rb; +} + +static ssize_t scsiback_tpg_param_alias_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = param_to_tpg(item); + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + int len; + + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("param alias: %s, exceeds max: %d\n", page, + VSCSI_NAMELEN); + return -EINVAL; + } + + mutex_lock(&tpg->tv_tpg_mutex); + len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page); + if (tpg->param_alias[len - 1] == '\n') + tpg->param_alias[len - 1] = '\0'; + mutex_unlock(&tpg->tv_tpg_mutex); + + return count; +} + +CONFIGFS_ATTR(scsiback_tpg_param_, alias); + +static struct configfs_attribute *scsiback_param_attrs[] = { + &scsiback_tpg_param_attr_alias, + NULL, +}; + +static int scsiback_alloc_sess_cb(struct se_portal_group *se_tpg, + struct se_session 
*se_sess, void *p) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + tpg->tpg_nexus = p; + return 0; +} + +static int scsiback_make_nexus(struct scsiback_tpg *tpg, + const char *name) +{ + struct scsiback_nexus *tv_nexus; + int ret = 0; + + mutex_lock(&tpg->tv_tpg_mutex); + if (tpg->tpg_nexus) { + pr_debug("tpg->tpg_nexus already exists\n"); + ret = -EEXIST; + goto out_unlock; + } + + tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL); + if (!tv_nexus) { + ret = -ENOMEM; + goto out_unlock; + } + + tv_nexus->tvn_se_sess = target_setup_session(&tpg->se_tpg, + VSCSI_DEFAULT_SESSION_TAGS, + sizeof(struct vscsibk_pend), + TARGET_PROT_NORMAL, name, + tv_nexus, scsiback_alloc_sess_cb); + if (IS_ERR(tv_nexus->tvn_se_sess)) { + kfree(tv_nexus); + ret = -ENOMEM; + goto out_unlock; + } + +out_unlock: + mutex_unlock(&tpg->tv_tpg_mutex); + return ret; +} + +static int scsiback_drop_nexus(struct scsiback_tpg *tpg) +{ + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + se_sess = tv_nexus->tvn_se_sess; + if (!se_sess) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + if (tpg->tv_tpg_port_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n", + tpg->tv_tpg_port_count); + return -EBUSY; + } + + if (tpg->tv_tpg_fe_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n", + tpg->tv_tpg_fe_count); + return -EBUSY; + } + + pr_debug("Removing I_T Nexus to emulated %s Initiator Port: %s\n", + scsiback_dump_proto_id(tpg->tport), + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + + /* + * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port + */ + target_remove_session(se_sess); + tpg->tpg_nexus = NULL; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(tv_nexus); + return 0; +} + +static ssize_t scsiback_tpg_nexus_show(struct config_item *item, char *page) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_nexus *tv_nexus; + ssize_t ret; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s\n", + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + mutex_unlock(&tpg->tv_tpg_mutex); + + return ret; +} + +static ssize_t scsiback_tpg_nexus_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport_wwn = tpg->tport; + unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr; + int ret; + /* + * Shutdown the active I_T nexus if 'NULL' is passed. + */ + if (!strncmp(page, "NULL", 4)) { + ret = scsiback_drop_nexus(tpg); + return (!ret) ? count : ret; + } + /* + * Otherwise make sure the passed virtual Initiator port WWN matches + * the fabric protocol_id set in scsiback_make_tport(), and call + * scsiback_make_nexus(). 
+ */ + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n", + page, VSCSI_NAMELEN); + return -EINVAL; + } + snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page); + + ptr = strstr(i_port, "naa."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) { + pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + ptr = strstr(i_port, "fc."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) { + pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[3]; /* Skip over "fc." */ + goto check_newline; + } + ptr = strstr(i_port, "iqn."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) { + pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + pr_err("Unable to locate prefix for emulated Initiator Port: %s\n", + i_port); + return -EINVAL; + /* + * Clear any trailing newline for the NAA WWN + */ +check_newline: + if (i_port[strlen(i_port) - 1] == '\n') + i_port[strlen(i_port) - 1] = '\0'; + + ret = scsiback_make_nexus(tpg, port_ptr); + if (ret < 0) + return ret; + + return count; +} + +CONFIGFS_ATTR(scsiback_tpg_, nexus); + +static struct configfs_attribute *scsiback_tpg_attrs[] = { + &scsiback_tpg_attr_nexus, + NULL, +}; + +static ssize_t +scsiback_wwn_version_show(struct config_item *item, char *page) +{ + return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on " + UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); +} + +CONFIGFS_ATTR_RO(scsiback_wwn_, version); + +static struct configfs_attribute *scsiback_wwn_attrs[] = { + &scsiback_wwn_attr_version, + NULL, +}; + +static int scsiback_port_link(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + + return 0; +} + +static void scsiback_port_unlink(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count--; + mutex_unlock(&tpg->tv_tpg_mutex); +} + +static struct se_portal_group * +scsiback_make_tpg(struct se_wwn *wwn, const char *name) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + struct scsiback_tpg *tpg; + u16 tpgt; + int ret; + + if (strstr(name, "tpgt_") != name) + return ERR_PTR(-EINVAL); + ret = kstrtou16(name + 5, 10, &tpgt); + if (ret) + return ERR_PTR(ret); + + tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL); + if (!tpg) + return ERR_PTR(-ENOMEM); + + mutex_init(&tpg->tv_tpg_mutex); + INIT_LIST_HEAD(&tpg->tv_tpg_list); + INIT_LIST_HEAD(&tpg->info_list); + tpg->tport = tport; + tpg->tport_tpgt = tpgt; + + ret = core_tpg_register(wwn, &tpg->se_tpg, tport->tport_proto_id); + if (ret < 0) { + kfree(tpg); + return NULL; + } + mutex_lock(&scsiback_mutex); + list_add_tail(&tpg->tv_tpg_list, &scsiback_list); + mutex_unlock(&scsiback_mutex); + + return &tpg->se_tpg; +} + +static void scsiback_drop_tpg(struct se_portal_group *se_tpg) +{ + struct 
scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&scsiback_mutex); + list_del(&tpg->tv_tpg_list); + mutex_unlock(&scsiback_mutex); + /* + * Release the virtual I_T Nexus for this xen-pvscsi TPG + */ + scsiback_drop_nexus(tpg); + /* + * Deregister the se_tpg from TCM. + */ + core_tpg_deregister(se_tpg); + kfree(tpg); +} + +static int scsiback_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static const struct target_core_fabric_ops scsiback_ops = { + .module = THIS_MODULE, + .fabric_name = "xen-pvscsi", + .tpg_get_wwn = scsiback_get_fabric_wwn, + .tpg_get_tag = scsiback_get_tag, + .tpg_check_demo_mode = scsiback_check_true, + .tpg_check_demo_mode_cache = scsiback_check_true, + .check_stop_free = scsiback_check_stop_free, + .release_cmd = scsiback_release_cmd, + .sess_get_initiator_sid = NULL, + .write_pending = scsiback_write_pending, + .queue_data_in = scsiback_queue_data_in, + .queue_status = scsiback_queue_status, + .queue_tm_rsp = scsiback_queue_tm_rsp, + .aborted_task = scsiback_aborted_task, + /* + * Setup callers for generic logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = scsiback_make_tport, + .fabric_drop_wwn = scsiback_drop_tport, + .fabric_make_tpg = scsiback_make_tpg, + .fabric_drop_tpg = scsiback_drop_tpg, + .fabric_post_link = scsiback_port_link, + .fabric_pre_unlink = scsiback_port_unlink, + + .tfc_wwn_attrs = scsiback_wwn_attrs, + .tfc_tpg_base_attrs = scsiback_tpg_attrs, + .tfc_tpg_param_attrs = scsiback_param_attrs, + + .default_submit_type = TARGET_DIRECT_SUBMIT, + .direct_submit_supp = 1, +}; + +static const struct xenbus_device_id scsiback_ids[] = { + { "vscsi" }, + { "" } +}; + +static struct xenbus_driver scsiback_driver = { + .ids = scsiback_ids, + .probe = scsiback_probe, + .remove = scsiback_remove, + .otherend_changed = scsiback_frontend_changed +}; + +static int __init scsiback_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); + + ret = xenbus_register_backend(&scsiback_driver); + if (ret) + goto out; + + ret = target_register_template(&scsiback_ops); + if (ret) + goto out_unregister_xenbus; + + return 0; + +out_unregister_xenbus: + xenbus_unregister_driver(&scsiback_driver); +out: + pr_err("%s: error %d\n", __func__, ret); + return ret; +} + +static void __exit scsiback_exit(void) +{ + target_unregister_template(&scsiback_ops); + xenbus_unregister_driver(&scsiback_driver); +} + +module_init(scsiback_init); +module_exit(scsiback_exit); + +MODULE_DESCRIPTION("Xen SCSI backend driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:vscsi"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c deleted file mode 100644 index 02817a85f877..000000000000 --- a/drivers/xen/xen-selfballoon.c +++ /dev/null @@ -1,538 +0,0 @@ -/****************************************************************************** - * Xen selfballoon driver (and optional frontswap self-shrinking driver) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * This code complements the cleancache and frontswap patchsets to optimize - * support for Xen Transcendent Memory ("tmem"). The policy it implements - * is rudimentary and will likely improve over time, but it does work well - * enough today. 
- * - * Two functionalities are implemented here which both use "control theory" - * (feedback) to optimize memory utilization. In a virtualized environment - * such as Xen, RAM is often a scarce resource and we would like to ensure - * that each of a possibly large number of virtual machines is using RAM - * efficiently, i.e. using as little as possible when under light load - * and obtaining as much as possible when memory demands are high. - * Since RAM needs vary highly dynamically and sometimes dramatically, - * "hysteresis" is used, that is, memory target is determined not just - * on current data but also on past data stored in the system. - * - * "Selfballooning" creates memory pressure by managing the Xen balloon - * driver to decrease and increase available kernel memory, driven - * largely by the target value of "Committed_AS" (see /proc/meminfo). - * Since Committed_AS does not account for clean mapped pages (i.e. pages - * in RAM that are identical to pages on disk), selfballooning has the - * affect of pushing less frequently used clean pagecache pages out of - * kernel RAM and, presumably using cleancache, into Xen tmem where - * Xen can more efficiently optimize RAM utilization for such pages. - * - * When kernel memory demand unexpectedly increases faster than Xen, via - * the selfballoon driver, is able to (or chooses to) provide usable RAM, - * the kernel may invoke swapping. In most cases, frontswap is able - * to absorb this swapping into Xen tmem. However, due to the fact - * that the kernel swap subsystem assumes swapping occurs to a disk, - * swapped pages may sit on the disk for a very long time; even if - * the kernel knows the page will never be used again. This is because - * the disk space costs very little and can be overwritten when - * necessary. When such stale pages are in frontswap, however, they - * are taking up valuable real estate. "Frontswap selfshrinking" works - * to resolve this: When frontswap activity is otherwise stable - * and the guest kernel is not under memory pressure, the "frontswap - * selfshrinking" accounts for this by providing pressure to remove some - * pages from frontswap and return them to kernel memory. - * - * For both "selfballooning" and "frontswap-selfshrinking", a worker - * thread is used and sysfs tunables are provided to adjust the frequency - * and rate of adjustments to achieve the goal, as well as to disable one - * or both functions independently. - * - * While some argue that this functionality can and should be implemented - * in userspace, it has been observed that bad things happen (e.g. OOMs). - * - * System configuration note: Selfballooning should not be enabled on - * systems without a sufficiently large swap device configured; for best - * results, it is recommended that total swap be increased by the size - * of the guest memory. Note, that selfballooning should be disabled by default - * if frontswap is not configured. Similarly selfballooning should be enabled - * by default if frontswap is configured and can be disabled with the - * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is - * configured, frontswap-selfshrinking can be disabled with the - * "tmem.selfshrink=0" kernel boot option. - * - * Selfballooning is disallowed in domain0 and force-disabled. 
- * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/bootmem.h> -#include <linux/swap.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/module.h> -#include <linux/workqueue.h> -#include <linux/device.h> -#include <xen/balloon.h> -#include <xen/tmem.h> -#include <xen/xen.h> - -/* Enable/disable with sysfs. */ -static int xen_selfballooning_enabled __read_mostly; - -/* - * Controls rate at which memory target (this iteration) approaches - * ultimate goal when memory need is increasing (up-hysteresis) or - * decreasing (down-hysteresis). Higher values of hysteresis cause - * slower increases/decreases. The default values for the various - * parameters were deemed reasonable by experimentation, may be - * workload-dependent, and can all be adjusted via sysfs. - */ -static unsigned int selfballoon_downhysteresis __read_mostly = 8; -static unsigned int selfballoon_uphysteresis __read_mostly = 1; - -/* In HZ, controls frequency of worker invocation. */ -static unsigned int selfballoon_interval __read_mostly = 5; - -/* - * Minimum usable RAM in MB for selfballooning target for balloon. - * If non-zero, it is added to totalreserve_pages and self-ballooning - * will not balloon below the sum. If zero, a piecewise linear function - * is calculated as a minimum and added to totalreserve_pages. Note that - * setting this value indiscriminately may cause OOMs and crashes. - */ -static unsigned int selfballoon_min_usable_mb; - -/* - * Amount of RAM in MB to add to the target number of pages. - * Can be used to reserve some more room for caches and the like. - */ -static unsigned int selfballoon_reserved_mb; - -static void selfballoon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); - -#ifdef CONFIG_FRONTSWAP -#include <linux/frontswap.h> - -/* Enable/disable with sysfs. */ -static bool frontswap_selfshrinking __read_mostly; - -/* - * The default values for the following parameters were deemed reasonable - * by experimentation, may be workload-dependent, and can all be - * adjusted via sysfs. - */ - -/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ -static unsigned int frontswap_hysteresis __read_mostly = 20; - -/* - * Number of selfballoon worker invocations to wait before observing that - * frontswap selfshrinking should commence. Note that selfshrinking does - * not use a separate worker thread. - */ -static unsigned int frontswap_inertia __read_mostly = 3; - -/* Countdown to next invocation of frontswap_shrink() */ -static unsigned long frontswap_inertia_counter; - -/* - * Invoked by the selfballoon worker thread, uses current number of pages - * in frontswap (frontswap_curr_pages()), previous status, and control - * values (hysteresis and inertia) to determine if frontswap should be - * shrunk and what the new frontswap size should be. Note that - * frontswap_shrink is essentially a partial swapoff that immediately - * transfers pages from the "swap device" (frontswap) back into kernel - * RAM; despite the name, frontswap "shrinking" is very different from - * the "shrinker" interface used by the kernel MM subsystem to reclaim - * memory. 
- */ -static void frontswap_selfshrink(void) -{ - static unsigned long cur_frontswap_pages; - static unsigned long last_frontswap_pages; - static unsigned long tgt_frontswap_pages; - - last_frontswap_pages = cur_frontswap_pages; - cur_frontswap_pages = frontswap_curr_pages(); - if (!cur_frontswap_pages || - (cur_frontswap_pages > last_frontswap_pages)) { - frontswap_inertia_counter = frontswap_inertia; - return; - } - if (frontswap_inertia_counter && --frontswap_inertia_counter) - return; - if (cur_frontswap_pages <= frontswap_hysteresis) - tgt_frontswap_pages = 0; - else - tgt_frontswap_pages = cur_frontswap_pages - - (cur_frontswap_pages / frontswap_hysteresis); - frontswap_shrink(tgt_frontswap_pages); -} - -#endif /* CONFIG_FRONTSWAP */ - -#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) - -/* - * Use current balloon size, the goal (vm_committed_as), and hysteresis - * parameters to set a new target balloon size - */ -static void selfballoon_process(struct work_struct *work) -{ - unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; - unsigned long useful_pages; - bool reset_timer = false; - - if (xen_selfballooning_enabled) { - cur_pages = totalram_pages; - tgt_pages = cur_pages; /* default is no change */ - goal_pages = vm_memory_committed() + - totalreserve_pages + - MB2PAGES(selfballoon_reserved_mb); -#ifdef CONFIG_FRONTSWAP - /* allow space for frontswap pages to be repatriated */ - if (frontswap_selfshrinking && frontswap_enabled) - goal_pages += frontswap_curr_pages(); -#endif - if (cur_pages > goal_pages) - tgt_pages = cur_pages - - ((cur_pages - goal_pages) / - selfballoon_downhysteresis); - else if (cur_pages < goal_pages) - tgt_pages = cur_pages + - ((goal_pages - cur_pages) / - selfballoon_uphysteresis); - /* else if cur_pages == goal_pages, no change */ - useful_pages = max_pfn - totalreserve_pages; - if (selfballoon_min_usable_mb != 0) - floor_pages = totalreserve_pages + - MB2PAGES(selfballoon_min_usable_mb); - /* piecewise linear function ending in ~3% slope */ - else if (useful_pages < MB2PAGES(16)) - floor_pages = max_pfn; /* not worth ballooning */ - else if (useful_pages < MB2PAGES(64)) - floor_pages = totalreserve_pages + MB2PAGES(16) + - ((useful_pages - MB2PAGES(16)) >> 1); - else if (useful_pages < MB2PAGES(512)) - floor_pages = totalreserve_pages + MB2PAGES(40) + - ((useful_pages - MB2PAGES(40)) >> 3); - else /* useful_pages >= MB2PAGES(512) */ - floor_pages = totalreserve_pages + MB2PAGES(99) + - ((useful_pages - MB2PAGES(99)) >> 5); - if (tgt_pages < floor_pages) - tgt_pages = floor_pages; - balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages); - reset_timer = true; - } -#ifdef CONFIG_FRONTSWAP - if (frontswap_selfshrinking && frontswap_enabled) { - frontswap_selfshrink(); - reset_timer = true; - } -#endif - if (reset_timer) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); -} - -#ifdef CONFIG_SYSFS - -#include <linux/capability.h> - -#define SELFBALLOON_SHOW(name, format, args...) 
\ - static ssize_t show_##name(struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } - -SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); - -static ssize_t store_selfballooning(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - bool was_enabled = xen_selfballooning_enabled; - unsigned long tmp; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) - return -EINVAL; - - xen_selfballooning_enabled = !!tmp; - if (!was_enabled && xen_selfballooning_enabled) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); - - return count; -} - -static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR, - show_selfballooning, store_selfballooning); - -SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); - -static ssize_t store_selfballoon_interval(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - selfballoon_interval = val; - return count; -} - -static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, - show_selfballoon_interval, store_selfballoon_interval); - -SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); - -static ssize_t store_selfballoon_downhys(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - selfballoon_downhysteresis = val; - return count; -} - -static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, - show_selfballoon_downhys, store_selfballoon_downhys); - - -SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); - -static ssize_t store_selfballoon_uphys(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - selfballoon_uphysteresis = val; - return count; -} - -static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, - show_selfballoon_uphys, store_selfballoon_uphys); - -SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", - selfballoon_min_usable_mb); - -static ssize_t store_selfballoon_min_usable_mb(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - selfballoon_min_usable_mb = val; - return count; -} - -static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, - show_selfballoon_min_usable_mb, - store_selfballoon_min_usable_mb); - -SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", - selfballoon_reserved_mb); - -static ssize_t store_selfballoon_reserved_mb(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - selfballoon_reserved_mb = val; - return count; -} - -static 
DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, - show_selfballoon_reserved_mb, - store_selfballoon_reserved_mb); - - -#ifdef CONFIG_FRONTSWAP -SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); - -static ssize_t store_frontswap_selfshrinking(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - bool was_enabled = frontswap_selfshrinking; - unsigned long tmp; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) - return -EINVAL; - frontswap_selfshrinking = !!tmp; - if (!was_enabled && !xen_selfballooning_enabled && - frontswap_selfshrinking) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); - - return count; -} - -static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, - show_frontswap_selfshrinking, store_frontswap_selfshrinking); - -SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); - -static ssize_t store_frontswap_inertia(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - frontswap_inertia = val; - frontswap_inertia_counter = val; - return count; -} - -static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, - show_frontswap_inertia, store_frontswap_inertia); - -SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); - -static ssize_t store_frontswap_hysteresis(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) - return -EINVAL; - frontswap_hysteresis = val; - return count; -} - -static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, - show_frontswap_hysteresis, store_frontswap_hysteresis); - -#endif /* CONFIG_FRONTSWAP */ - -static struct attribute *selfballoon_attrs[] = { - &dev_attr_selfballooning.attr, - &dev_attr_selfballoon_interval.attr, - &dev_attr_selfballoon_downhysteresis.attr, - &dev_attr_selfballoon_uphysteresis.attr, - &dev_attr_selfballoon_min_usable_mb.attr, - &dev_attr_selfballoon_reserved_mb.attr, -#ifdef CONFIG_FRONTSWAP - &dev_attr_frontswap_selfshrinking.attr, - &dev_attr_frontswap_hysteresis.attr, - &dev_attr_frontswap_inertia.attr, -#endif - NULL -}; - -static const struct attribute_group selfballoon_group = { - .name = "selfballoon", - .attrs = selfballoon_attrs -}; -#endif - -int register_xen_selfballooning(struct device *dev) -{ - int error = -1; - -#ifdef CONFIG_SYSFS - error = sysfs_create_group(&dev->kobj, &selfballoon_group); -#endif - return error; -} -EXPORT_SYMBOL(register_xen_selfballooning); - -int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) -{ - bool enable = false; - - if (!xen_domain()) - return -ENODEV; - - if (xen_initial_domain()) { - pr_info("Xen selfballooning driver disabled for domain0\n"); - return -ENODEV; - } - - xen_selfballooning_enabled = tmem_enabled && use_selfballooning; - if (xen_selfballooning_enabled) { - pr_info("Initializing Xen selfballooning driver\n"); - enable = true; - } -#ifdef CONFIG_FRONTSWAP - frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; - if (frontswap_selfshrinking) { - pr_info("Initializing frontswap selfshrinking driver\n"); - enable = true; - } -#endif - if 
(!enable) - return -ENODEV; - - schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); - - return 0; -} -EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c deleted file mode 100644 index bbef194c5b01..000000000000 --- a/drivers/xen/xen-stub.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * xen-stub.c - stub drivers to reserve space for Xen - * - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * Copyright (C) 2012 Oracle Inc - * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/acpi.h> -#include <xen/acpi.h> - -#ifdef CONFIG_ACPI - -/*-------------------------------------------- - stub driver for Xen memory hotplug ---------------------------------------------*/ - -static const struct acpi_device_id memory_device_ids[] = { - {ACPI_MEMORY_DEVICE_HID, 0}, - {"", 0}, -}; - -static struct acpi_driver xen_stub_memory_device_driver = { - /* same name as native memory driver to block native loaded */ - .name = "acpi_memhotplug", - .class = ACPI_MEMORY_DEVICE_CLASS, - .ids = memory_device_ids, -}; - -int xen_stub_memory_device_init(void) -{ - if (!xen_initial_domain()) - return -ENODEV; - - /* just reserve space for Xen, block native driver loaded */ - return acpi_bus_register_driver(&xen_stub_memory_device_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); -subsys_initcall(xen_stub_memory_device_init); - -void xen_stub_memory_device_exit(void) -{ - acpi_bus_unregister_driver(&xen_stub_memory_device_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); - - -/*-------------------------------------------- - stub driver for Xen cpu hotplug ---------------------------------------------*/ - -static const struct acpi_device_id processor_device_ids[] = { - {ACPI_PROCESSOR_OBJECT_HID, 0}, - {ACPI_PROCESSOR_DEVICE_HID, 0}, - {"", 0}, -}; - -static struct acpi_driver xen_stub_processor_driver = { - /* same name as native processor driver to block native loaded */ - .name = "processor", - .class = ACPI_PROCESSOR_CLASS, - .ids = processor_device_ids, -}; - -int xen_stub_processor_init(void) -{ - if (!xen_initial_domain()) - return -ENODEV; - - /* just reserve space for Xen, block native driver loaded */ - return acpi_bus_register_driver(&xen_stub_processor_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_processor_init); -subsys_initcall(xen_stub_processor_init); - -void xen_stub_processor_exit(void) -{ - acpi_bus_unregister_driver(&xen_stub_processor_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_processor_exit); - -#endif diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 31e2e9050c7a..b0d69602214e 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -1,14 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y += xenbus.o -obj-y += 
xenbus_dev_frontend.o -xenbus-objs = -xenbus-objs += xenbus_client.o -xenbus-objs += xenbus_comms.o -xenbus-objs += xenbus_xs.o -xenbus-objs += xenbus_probe.o +xenbus-y := xenbus_client.o +xenbus-y += xenbus_comms.o +xenbus-y += xenbus_xs.o +xenbus-y += xenbus_probe.o -xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o -xenbus-objs += $(xenbus-be-objs-y) +xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o +obj-y += xenbus_dev_frontend.o obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h new file mode 100644 index 000000000000..9ac0427724a3 --- /dev/null +++ b/drivers/xen/xenbus/xenbus.h @@ -0,0 +1,141 @@ +/* + * Private include for xenbus communications. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _XENBUS_XENBUS_H +#define _XENBUS_XENBUS_H + +#include <linux/mutex.h> +#include <linux/uio.h> +#include <xen/xenbus.h> + +#define XEN_BUS_ID_SIZE 20 + +struct xen_bus_type { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); + int (*probe)(struct xen_bus_type *bus, const char *type, + const char *dir); + bool (*otherend_will_handle)(struct xenbus_watch *watch, + const char *path, const char *token); + void (*otherend_changed)(struct xenbus_watch *watch, const char *path, + const char *token); + struct bus_type bus; +}; + +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +struct xs_watch_event { + struct list_head list; + unsigned int len; + struct xenbus_watch *handle; + const char *path; + const char *token; + char body[]; +}; + +enum xb_req_state { + xb_req_state_queued, + xb_req_state_wait_reply, + xb_req_state_got_reply, + xb_req_state_aborted +}; + +struct xb_req_data { + struct list_head list; + wait_queue_head_t wq; + struct kref kref; + struct xsd_sockmsg msg; + uint32_t caller_req_id; + enum xsd_sockmsg_type type; + char *body; + const struct kvec *vec; + int num_vecs; + int err; + enum xb_req_state state; + bool user_req; + void (*cb)(struct xb_req_data *); + void *par; +}; + +extern enum xenstore_init xen_store_domain_type; +extern const struct attribute_group *xenbus_dev_groups[]; +extern struct mutex xs_response_mutex; +extern struct list_head xs_reply_list; +extern struct list_head xb_write_list; +extern wait_queue_head_t xb_waitq; +extern struct mutex xb_write_mutex; + +int xs_init(void); +int xb_init_comms(void); +void xb_deinit_comms(void); +int xs_watch_msg(struct xs_watch_event *event); +void xs_request_exit(struct xb_req_data *req); +void xs_free_req(struct kref *kref); + +int xenbus_match(struct device *_dev, const struct device_driver *_drv); +int xenbus_dev_probe(struct device *_dev); +void xenbus_dev_remove(struct device *_dev); +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +int xenbus_probe_devices(struct xen_bus_type *bus); + +void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + +int xenbus_dev_suspend(struct device *dev); +int xenbus_dev_resume(struct device *dev); +int xenbus_dev_cancel(struct device *dev); + +void xenbus_otherend_changed(struct xenbus_watch *watch, + const char *path, const char *token, + int ignore_on_shutdown); + +int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +void xenbus_ring_ops_init(void); + +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par); +void xenbus_dev_queue_reply(struct xb_req_data *req); + +extern unsigned int xb_dev_generation_id; + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index ec097d6f964d..2dc874fb5506 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -37,7 +37,7 @@ #include <linux/vmalloc.h> #include <linux/export.h> #include <asm/xen/hypervisor.h> -#include <asm/xen/page.h> +#include <xen/page.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> #include <xen/balloon.h> @@ -45,23 +45,50 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> -#include "xenbus_probe.h" +#include "xenbus.h" + 
+#define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) + +#define XENBUS_MAX_RING_PAGES (XENBUS_PAGES(XENBUS_MAX_RING_GRANTS)) struct xenbus_map_node { struct list_head next; union { - struct vm_struct *area; /* PV */ - struct page *page; /* HVM */ + struct { + struct vm_struct *area; + } pv; + struct { + struct page *pages[XENBUS_MAX_RING_PAGES]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; + void *addr; + } hvm; }; - grant_handle_t handle; + grant_handle_t handles[XENBUS_MAX_RING_GRANTS]; + unsigned int nr_handles; +}; + +struct map_ring_valloc { + struct xenbus_map_node *node; + + /* Why do we need two arrays? See comment of __xenbus_map_ring */ + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + + struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; + + unsigned int idx; }; static DEFINE_SPINLOCK(xenbus_valloc_lock); static LIST_HEAD(xenbus_valloc_pages); struct xenbus_ring_ops { - int (*map)(struct xenbus_device *dev, int gnt, void **vaddr); + int (*map)(struct xenbus_device *dev, struct map_ring_valloc *info, + grant_ref_t *gnt_refs, unsigned int nr_grefs, + void **vaddr); int (*unmap)(struct xenbus_device *dev, void *vaddr); }; @@ -89,29 +116,36 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); * @dev: xenbus device * @path: path to watch * @watch: watch to register + * @will_handle: events queuing determine callback * @callback: callback to register * * Register a @watch on the given path, using the given xenbus_watch structure - * for storage, and the given @callback function as the callback. Return 0 on - * success, or -errno on error. On success, the given @path will be saved as - * @watch->node, and remains the caller's to free. On error, @watch->node will - * be NULL, the device will switch to %XenbusStateClosing, and the error will - * be saved in the store. + * for storage, @will_handle function as the callback to determine if each + * event need to be queued, and the given @callback function as the callback. + * On success, the given @path will be saved as @watch->node, and remains the + * caller's to free. On error, @watch->node will be NULL, the device will + * switch to %XenbusStateClosing, and the error will be saved in the store. + * + * Returns: %0 on success or -errno on error */ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, - const char **, unsigned int)) + const char *, const char *)) { int err; watch->node = path; + watch->will_handle = will_handle; watch->callback = callback; err = register_xenbus_watch(watch); if (err) { watch->node = NULL; + watch->will_handle = NULL; watch->callback = NULL; xenbus_dev_fatal(dev, err, "adding watch on %s", path); } @@ -125,21 +159,27 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path * @dev: xenbus device * @watch: watch to register + * @will_handle: events queuing determine callback * @callback: callback to register * @pathfmt: format of path to watch * * Register a watch on the given @path, using the given xenbus_watch - * structure for storage, and the given @callback function as the callback. - * Return 0 on success, or -errno on error. On success, the watched path - * (@path/@path2) will be saved as @watch->node, and becomes the caller's to - * kfree(). 
On error, watch->node will be NULL, so the caller has nothing to + * structure for storage, @will_handle function as the callback to determine if + * each event need to be queued, and the given @callback function as the + * callback. On success, the watched path (@path/@path2) will be saved + * as @watch->node, and becomes the caller's to kfree(). + * On error, watch->node will be NULL, so the caller has nothing to * free, the device will switch to %XenbusStateClosing, and the error will be * saved in the store. + * + * Returns: %0 on success or -errno on error */ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, - const char **, unsigned int), + const char *, const char *), const char *pathfmt, ...) { int err; @@ -154,7 +194,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; } - err = xenbus_watch_path(dev, path, watch, callback); + err = xenbus_watch_path(dev, path, watch, will_handle, callback); if (err) kfree(path); @@ -162,6 +202,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); +__printf(4, 5) static void xenbus_switch_fatal(struct xenbus_device *, int, int, const char *, ...); @@ -222,13 +263,15 @@ abort: } /** - * xenbus_switch_state + * xenbus_switch_state - save the new state of a driver * @dev: xenbus device * @state: new state * * Advertise in the store a change of the given driver to the given new_state. - * Return 0 on success, or -errno on error. On error, the device will switch - * to XenbusStateClosing, and the error will be saved in the store. + * On error, the device will switch to XenbusStateClosing, and the error + * will be saved in the store. + * + * Returns: %0 on success or -errno on error */ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) { @@ -245,58 +288,35 @@ int xenbus_frontend_closed(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_frontend_closed); -/** - * Return the path to the error node for the given device, or NULL on failure. - * If the value returned is non-NULL, then it is the caller's to kfree. 
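/*
 * Illustrative sketch, not part of the patch: registering a watch with the
 * reworked xenbus_watch_path() shown above, which now takes an optional
 * will_handle() filter plus a (path, token) callback in place of the old
 * (vec, len) callback.  The example_* names are invented; passing NULL for
 * will_handle should queue every event.  Assumes the usual <xen/xenbus.h>
 * declarations are in scope.
 */
static void example_otherend_changed(struct xenbus_watch *watch,
				     const char *path, const char *token)
{
	pr_info("xenstore node %s changed\n", path);
}

static int example_register_watch(struct xenbus_device *dev,
				  struct xenbus_watch *watch)
{
	return xenbus_watch_path(dev, dev->otherend, watch, NULL,
				 example_otherend_changed);
}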
- */ -static char *error_path(struct xenbus_device *dev) -{ - return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); -} - - +__printf(3, 0) static void xenbus_va_dev_error(struct xenbus_device *dev, int err, const char *fmt, va_list ap) { - int ret; unsigned int len; - char *printf_buffer = NULL; - char *path_buffer = NULL; + char *printf_buffer; + char *path_buffer; #define PRINTF_BUFFER_SIZE 4096 + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); - if (printf_buffer == NULL) - goto fail; + if (!printf_buffer) + return; len = sprintf(printf_buffer, "%i ", -err); - ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); - - BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1); + vsnprintf(printf_buffer + len, PRINTF_BUFFER_SIZE - len, fmt, ap); dev_err(&dev->dev, "%s\n", printf_buffer); - path_buffer = error_path(dev); + path_buffer = kasprintf(GFP_KERNEL, "error/%s", dev->nodename); + if (path_buffer) + xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer); - if (path_buffer == NULL) { - dev_err(&dev->dev, "failed to write error node for %s (%s)\n", - dev->nodename, printf_buffer); - goto fail; - } - - if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) { - dev_err(&dev->dev, "failed to write error node for %s (%s)\n", - dev->nodename, printf_buffer); - goto fail; - } - -fail: kfree(printf_buffer); kfree(path_buffer); } - /** - * xenbus_dev_error + * xenbus_dev_error - place an error message into the store * @dev: xenbus device * @err: error to report * @fmt: error message format @@ -315,7 +335,7 @@ void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...) EXPORT_SYMBOL_GPL(xenbus_dev_error); /** - * xenbus_dev_fatal + * xenbus_dev_fatal - put an error messages into the store and then shutdown * @dev: xenbus device * @err: error to report * @fmt: error message format @@ -337,7 +357,7 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) } EXPORT_SYMBOL_GPL(xenbus_dev_fatal); -/** +/* * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps * avoiding recursion within xenbus_switch_state. */ @@ -354,32 +374,103 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, __xenbus_switch_state(dev, XenbusStateClosing, 1); } -/** - * xenbus_grant_ring +/* + * xenbus_setup_ring * @dev: xenbus device - * @ring_mfn: mfn of ring to grant - - * Grant access to the given @ring_mfn to the peer of the given device. Return - * 0 on success, or -errno on error. On error, the device will switch to - * XenbusStateClosing, and the error will be saved in the store. + * @vaddr: pointer to starting virtual address of the ring + * @nr_pages: number of pages to be granted + * @grefs: grant reference array to be filled in + * + * Allocate physically contiguous pages for a shared ring buffer and grant it + * to the peer of the given device. The ring buffer is initially filled with + * zeroes. The virtual address of the ring is stored at @vaddr and the + * grant references are stored in the @grefs array. In case of error @vaddr + * will be set to NULL and @grefs will be filled with INVALID_GRANT_REF. 
*/ -int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn) +int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, + unsigned int nr_pages, grant_ref_t *grefs) { - int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0); - if (err < 0) - xenbus_dev_fatal(dev, err, "granting access to ring page"); - return err; + unsigned long ring_size = nr_pages * XEN_PAGE_SIZE; + grant_ref_t gref_head; + unsigned int i; + void *addr; + int ret; + + addr = *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO); + if (!*vaddr) { + ret = -ENOMEM; + goto err; + } + + ret = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (ret) { + xenbus_dev_fatal(dev, ret, "granting access to %u ring pages", + nr_pages); + goto err; + } + + for (i = 0; i < nr_pages; i++) { + unsigned long gfn; + + if (is_vmalloc_addr(*vaddr)) + gfn = pfn_to_gfn(vmalloc_to_pfn(addr)); + else + gfn = virt_to_gfn(addr); + + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); + + addr += XEN_PAGE_SIZE; + } + + return 0; + + err: + if (*vaddr) + free_pages_exact(*vaddr, ring_size); + for (i = 0; i < nr_pages; i++) + grefs[i] = INVALID_GRANT_REF; + *vaddr = NULL; + + return ret; } -EXPORT_SYMBOL_GPL(xenbus_grant_ring); +EXPORT_SYMBOL_GPL(xenbus_setup_ring); +/* + * xenbus_teardown_ring + * @vaddr: starting virtual address of the ring + * @nr_pages: number of pages + * @grefs: grant reference array + * + * Remove grants for the shared ring buffer and free the associated memory. + * On return the grant reference array is filled with INVALID_GRANT_REF. + */ +void xenbus_teardown_ring(void **vaddr, unsigned int nr_pages, + grant_ref_t *grefs) +{ + unsigned int i; -/** + for (i = 0; i < nr_pages; i++) { + if (grefs[i] != INVALID_GRANT_REF) { + gnttab_end_foreign_access(grefs[i], NULL); + grefs[i] = INVALID_GRANT_REF; + } + } + + if (*vaddr) + free_pages_exact(*vaddr, nr_pages * XEN_PAGE_SIZE); + *vaddr = NULL; +} +EXPORT_SYMBOL_GPL(xenbus_teardown_ring); + +/* * Allocate an event channel for the given xenbus_device, assigning the newly * created local port to *port. Return 0 on success, or -errno on error. On * error, the device will switch to XenbusStateClosing, and the error will be * saved in the store. */ -int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) +int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port) { struct evtchn_alloc_unbound alloc_unbound; int err; @@ -399,37 +490,10 @@ int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); -/** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** +/* * Free an existing event channel. Returns 0 on success or -errno on error. 
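/*
 * Illustrative sketch, not part of the patch: how a hypothetical frontend
 * could use the xenbus_setup_ring()/xenbus_teardown_ring() helpers added
 * above to share a two-page ring and an event channel with its backend.
 * struct example_front_info and its fields are invented for the example;
 * the usual <xen/xenbus.h> and <xen/grant_table.h> headers are assumed.
 */
struct example_front_info {
	void *ring;			/* virtual address of the shared ring */
	grant_ref_t ring_refs[2];
	evtchn_port_t evtchn;
};

static int example_front_setup_ring(struct xenbus_device *dev,
				    struct example_front_info *info)
{
	int err;

	/* Allocate, zero and grant two physically contiguous ring pages. */
	err = xenbus_setup_ring(dev, GFP_KERNEL, &info->ring, 2,
				info->ring_refs);
	if (err)
		return err;

	err = xenbus_alloc_evtchn(dev, &info->evtchn);
	if (err) {
		/* Revoke the grants and free the ring pages again. */
		xenbus_teardown_ring(&info->ring, 2, info->ring_refs);
		return err;
	}

	return 0;
}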
*/ -int xenbus_free_evtchn(struct xenbus_device *dev, int port) +int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port) { struct evtchn_close close; int err; @@ -438,7 +502,7 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port) err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); if (err) - xenbus_dev_error(dev, err, "freeing event channel %d", port); + xenbus_dev_error(dev, err, "freeing event channel %u", port); return err; } @@ -446,149 +510,222 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn); /** - * xenbus_map_ring_valloc + * xenbus_map_ring_valloc - allocate & map pages of VA space * @dev: xenbus device - * @gnt_ref: grant reference + * @gnt_refs: grant reference array + * @nr_grefs: number of grant references * @vaddr: pointer to address to be filled out by mapping * - * Based on Rusty Russell's skeleton driver's map_page. - * Map a page of memory into this domain from another domain's grant table. - * xenbus_map_ring_valloc allocates a page of virtual address space, maps the - * page to that address, and sets *vaddr to that address. - * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) - * or -ENOMEM on error. If an error is returned, device will switch to + * Map @nr_grefs pages of memory into this domain from another + * domain's grant table. xenbus_map_ring_valloc allocates @nr_grefs + * pages of virtual address space, maps the pages to that address, and sets + * *vaddr to that address. If an error is returned, device will switch to * XenbusStateClosing and the error message will be saved in XenStore. + * + * Returns: %0 on success or -errno on error */ -int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr) +int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs, + unsigned int nr_grefs, void **vaddr) { - return ring_ops->map(dev, gnt_ref, vaddr); + int err; + struct map_ring_valloc *info; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->node = kzalloc(sizeof(*info->node), GFP_KERNEL); + if (!info->node) + err = -ENOMEM; + else + err = ring_ops->map(dev, info, gnt_refs, nr_grefs, vaddr); + + kfree(info->node); + kfree(info); + return err; } EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); -static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, - int gnt_ref, void **vaddr) +/* N.B. sizeof(phys_addr_t) doesn't always equal to sizeof(unsigned + * long), e.g. 32-on-64. 
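/*
 * Illustrative sketch, not part of the patch: a hypothetical backend mapping
 * a frontend's two-grant ring with the multi-grant xenbus_map_ring_valloc()
 * above and releasing it again with the existing xenbus_unmap_ring_vfree().
 * The example_* names are invented.
 */
static int example_back_map_ring(struct xenbus_device *dev,
				 grant_ref_t ring_refs[2], void **ring)
{
	return xenbus_map_ring_valloc(dev, ring_refs, 2, ring);
}

static void example_back_unmap_ring(struct xenbus_device *dev, void *ring)
{
	xenbus_unmap_ring_vfree(dev, ring);
}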
Caller is responsible for preparing the + * right array to feed into this function */ +static int __xenbus_map_ring(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + grant_handle_t *handles, + struct map_ring_valloc *info, + unsigned int flags, + bool *leaked) { - struct gnttab_map_grant_ref op = { - .flags = GNTMAP_host_map | GNTMAP_contains_pte, - .ref = gnt_ref, - .dom = dev->otherend_id, - }; - struct xenbus_map_node *node; - struct vm_struct *area; - pte_t *pte; + int i, j; - *vaddr = NULL; + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - area = alloc_vm_area(PAGE_SIZE, &pte); - if (!area) { - kfree(node); - return -ENOMEM; + for (i = 0; i < nr_grefs; i++) { + gnttab_set_map_op(&info->map[i], info->phys_addrs[i], flags, + gnt_refs[i], dev->otherend_id); + handles[i] = INVALID_GRANT_HANDLE; } - op.host_addr = arbitrary_virt_to_machine(pte).maddr; + gnttab_batch_map(info->map, i); - gnttab_batch_map(&op, 1); + for (i = 0; i < nr_grefs; i++) { + if (info->map[i].status != GNTST_okay) { + xenbus_dev_fatal(dev, info->map[i].status, + "mapping in shared page %d from domain %d", + gnt_refs[i], dev->otherend_id); + goto fail; + } else + handles[i] = info->map[i].handle; + } - if (op.status != GNTST_okay) { - free_vm_area(area); - kfree(node); - xenbus_dev_fatal(dev, op.status, - "mapping in shared page %d from domain %d", - gnt_ref, dev->otherend_id); - return op.status; + return 0; + + fail: + for (i = j = 0; i < nr_grefs; i++) { + if (handles[i] != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&info->unmap[j], + info->phys_addrs[i], + GNTMAP_host_map, handles[i]); + j++; + } } - node->handle = op.handle; - node->area = area; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, info->unmap, j)); - spin_lock(&xenbus_valloc_lock); - list_add(&node->next, &xenbus_valloc_pages); - spin_unlock(&xenbus_valloc_lock); + *leaked = false; + for (i = 0; i < j; i++) { + if (info->unmap[i].status != GNTST_okay) { + *leaked = true; + break; + } + } - *vaddr = area->addr; - return 0; + return -ENOENT; } -static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, - int gnt_ref, void **vaddr) +/** + * xenbus_unmap_ring - unmap memory from another domain + * @dev: xenbus device + * @handles: grant handle array + * @nr_handles: number of handles in the array + * @vaddrs: addresses to unmap + * + * Unmap memory in this domain that was imported from another domain. + * + * Returns: %0 on success or GNTST_* on error + * (see xen/include/interface/grant_table.h). 
+ */ +static int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t *handles, + unsigned int nr_handles, unsigned long *vaddrs) { - struct xenbus_map_node *node; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; + int i; int err; - void *addr; - *vaddr = NULL; + if (nr_handles > XENBUS_MAX_RING_GRANTS) + return -EINVAL; - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; + for (i = 0; i < nr_handles; i++) + gnttab_set_unmap_op(&unmap[i], vaddrs[i], + GNTMAP_host_map, handles[i]); + + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)); + + err = GNTST_okay; + for (i = 0; i < nr_handles; i++) { + if (unmap[i].status != GNTST_okay) { + xenbus_dev_error(dev, unmap[i].status, + "unmapping page at handle %d error %d", + handles[i], unmap[i].status); + err = unmap[i].status; + break; + } + } - err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */); + return err; +} + +static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct map_ring_valloc *info = data; + unsigned long vaddr = (unsigned long)gfn_to_virt(gfn); + + info->phys_addrs[info->idx] = vaddr; + info->addrs[info->idx] = vaddr; + + info->idx++; +} + +static int xenbus_map_ring_hvm(struct xenbus_device *dev, + struct map_ring_valloc *info, + grant_ref_t *gnt_ref, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node = info->node; + int err; + void *addr; + bool leaked = false; + unsigned int nr_pages = XENBUS_PAGES(nr_grefs); + + err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages); if (err) goto out_err; - addr = pfn_to_kaddr(page_to_pfn(node->page)); + gnttab_foreach_grant(node->hvm.pages, nr_grefs, + xenbus_map_ring_setup_grant_hvm, + info); + + err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, + info, GNTMAP_host_map, &leaked); + node->nr_handles = nr_grefs; - err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); if (err) - goto out_err_free_ballooned_pages; + goto out_free_ballooned_pages; + + addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP, + PAGE_KERNEL); + if (!addr) { + err = -ENOMEM; + goto out_xenbus_unmap_ring; + } + + node->hvm.addr = addr; spin_lock(&xenbus_valloc_lock); list_add(&node->next, &xenbus_valloc_pages); spin_unlock(&xenbus_valloc_lock); *vaddr = addr; + info->node = NULL; + return 0; - out_err_free_ballooned_pages: - free_xenballooned_pages(1, &node->page); + out_xenbus_unmap_ring: + if (!leaked) + xenbus_unmap_ring(dev, node->handles, nr_grefs, info->addrs); + else + pr_alert("leaking %p size %u page(s)", + addr, nr_pages); + out_free_ballooned_pages: + if (!leaked) + xen_free_unpopulated_pages(nr_pages, node->hvm.pages); out_err: - kfree(node); return err; } - -/** - * xenbus_map_ring - * @dev: xenbus device - * @gnt_ref: grant reference - * @handle: pointer to grant handle to be filled - * @vaddr: address to be mapped to - * - * Map a page of memory into this domain from another domain's grant table. - * xenbus_map_ring does not allocate the virtual address space (you must do - * this yourself!). It only maps in the page to the specified address. - * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) - * or -ENOMEM on error. If an error is returned, device will switch to - * XenbusStateClosing and the error message will be saved in XenStore. 
- */ -int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, - grant_handle_t *handle, void *vaddr) -{ - struct gnttab_map_grant_ref op; - - gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref, - dev->otherend_id); - - gnttab_batch_map(&op, 1); - - if (op.status != GNTST_okay) { - xenbus_dev_fatal(dev, op.status, - "mapping in shared page %d from domain %d", - gnt_ref, dev->otherend_id); - } else - *handle = op.handle; - - return op.status; -} -EXPORT_SYMBOL_GPL(xenbus_map_ring); - - /** - * xenbus_unmap_ring_vfree + * xenbus_unmap_ring_vfree - unmap a page of memory from another domain * @dev: xenbus device * @vaddr: addr to unmap * @@ -596,7 +733,8 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring); * Unmap a page of memory in this domain that was imported from another domain. * Use xenbus_unmap_ring_vfree if you mapped in your memory with * xenbus_map_ring_valloc (it will free the virtual address space). - * Returns 0 on success and returns GNTST_* on error + * + * Returns: %0 on success or GNTST_* on error * (see xen/include/interface/grant_table.h). */ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) @@ -605,17 +743,71 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) } EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); -static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) +#ifdef CONFIG_XEN_PV +static int map_ring_apply(pte_t *pte, unsigned long addr, void *data) +{ + struct map_ring_valloc *info = data; + + info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr; + return 0; +} + +static int xenbus_map_ring_pv(struct xenbus_device *dev, + struct map_ring_valloc *info, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node = info->node; + struct vm_struct *area; + bool leaked = false; + int err = -ENOMEM; + + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); + if (!area) + return -ENOMEM; + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info)) + goto failed; + err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, + info, GNTMAP_host_map | GNTMAP_contains_pte, + &leaked); + if (err) + goto failed; + + node->nr_handles = nr_grefs; + node->pv.area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = area->addr; + info->node = NULL; + + return 0; + +failed: + if (!leaked) + free_vm_area(area); + else + pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); + + return err; +} + +static int xenbus_unmap_ring_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; - struct gnttab_unmap_grant_ref op = { - .host_addr = (unsigned long)vaddr, - }; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; unsigned int level; + int i; + bool leaked = false; + int err; spin_lock(&xenbus_valloc_lock); list_for_each_entry(node, &xenbus_valloc_pages, next) { - if (node->area->addr == vaddr) { + if (node->pv.area->addr == vaddr) { list_del(&node->next); goto found; } @@ -630,33 +822,79 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return GNTST_bad_virt_addr; } - op.handle = node->handle; - op.host_addr = arbitrary_virt_to_machine( - lookup_address((unsigned long)vaddr, &level)).maddr; + for (i = 0; i < node->nr_handles; i++) { + unsigned long addr; - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) - BUG(); + memset(&unmap[i], 0, 
sizeof(unmap[i])); + addr = (unsigned long)vaddr + (XEN_PAGE_SIZE * i); + unmap[i].host_addr = arbitrary_virt_to_machine( + lookup_address(addr, &level)).maddr; + unmap[i].dev_bus_addr = 0; + unmap[i].handle = node->handles[i]; + } - if (op.status == GNTST_okay) - free_vm_area(node->area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)); + + err = GNTST_okay; + leaked = false; + for (i = 0; i < node->nr_handles; i++) { + if (unmap[i].status != GNTST_okay) { + leaked = true; + xenbus_dev_error(dev, unmap[i].status, + "unmapping page at handle %d error %d", + node->handles[i], unmap[i].status); + err = unmap[i].status; + break; + } + } + + if (!leaked) + free_vm_area(node->pv.area); else - xenbus_dev_error(dev, op.status, - "unmapping page at handle %d error %d", - node->handle, op.status); + pr_alert("leaking VM area %p size %u page(s)", + node->pv.area, node->nr_handles); kfree(node); - return op.status; + return err; } -static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_pv, + .unmap = xenbus_unmap_ring_pv, +}; +#endif + +struct unmap_ring_hvm +{ + unsigned int idx; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; +}; + +static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct unmap_ring_hvm *info = data; + + info->addrs[info->idx] = (unsigned long)gfn_to_virt(gfn); + + info->idx++; +} + +static int xenbus_unmap_ring_hvm(struct xenbus_device *dev, void *vaddr) { int rv; struct xenbus_map_node *node; void *addr; + struct unmap_ring_hvm info = { + .idx = 0, + }; + unsigned int nr_pages; spin_lock(&xenbus_valloc_lock); list_for_each_entry(node, &xenbus_valloc_pages, next) { - addr = pfn_to_kaddr(page_to_pfn(node->page)); + addr = node->hvm.addr; if (addr == vaddr) { list_del(&node->next); goto found; @@ -672,52 +910,30 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) return GNTST_bad_virt_addr; } - rv = xenbus_unmap_ring(dev, node->handle, addr); + nr_pages = XENBUS_PAGES(node->nr_handles); + + gnttab_foreach_grant(node->hvm.pages, node->nr_handles, + xenbus_unmap_ring_setup_grant_hvm, + &info); - if (!rv) - free_xenballooned_pages(1, &node->page); + rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles, + info.addrs); + if (!rv) { + vunmap(vaddr); + xen_free_unpopulated_pages(nr_pages, node->hvm.pages); + } else - WARN(1, "Leaking %p\n", vaddr); + WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages); kfree(node); return rv; } /** - * xenbus_unmap_ring - * @dev: xenbus device - * @handle: grant handle - * @vaddr: addr to unmap - * - * Unmap a page of memory in this domain that was imported from another domain. - * Returns 0 on success and returns GNTST_* on error - * (see xen/include/interface/grant_table.h). 
- */ -int xenbus_unmap_ring(struct xenbus_device *dev, - grant_handle_t handle, void *vaddr) -{ - struct gnttab_unmap_grant_ref op; - - gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, handle); - - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) - BUG(); - - if (op.status != GNTST_okay) - xenbus_dev_error(dev, op.status, - "unmapping page at handle %d error %d", - handle, op.status); - - return op.status; -} -EXPORT_SYMBOL_GPL(xenbus_unmap_ring); - - -/** - * xenbus_read_driver_state + * xenbus_read_driver_state - read state from a store path * @path: path for driver * - * Return the state of the driver rooted at the given store path, or + * Returns: the state of the driver rooted at the given store path, or * XenbusStateUnknown if no state can be read. */ enum xenbus_state xenbus_read_driver_state(const char *path) @@ -731,20 +947,17 @@ enum xenbus_state xenbus_read_driver_state(const char *path) } EXPORT_SYMBOL_GPL(xenbus_read_driver_state); -static const struct xenbus_ring_ops ring_ops_pv = { - .map = xenbus_map_ring_valloc_pv, - .unmap = xenbus_unmap_ring_vfree_pv, -}; - static const struct xenbus_ring_ops ring_ops_hvm = { - .map = xenbus_map_ring_valloc_hvm, - .unmap = xenbus_unmap_ring_vfree_hvm, + .map = xenbus_map_ring_hvm, + .unmap = xenbus_unmap_ring_hvm, }; void __init xenbus_ring_ops_init(void) { +#ifdef CONFIG_XEN_PV if (xen_pv_domain()) ring_ops = &ring_ops_pv; else +#endif ring_ops = &ring_ops_hvm; } diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index fdb0f339d0a7..82df2da1b880 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -34,27 +34,31 @@ #include <linux/wait.h> #include <linux/interrupt.h> +#include <linux/kthread.h> #include <linux/sched.h> #include <linux/err.h> #include <xen/xenbus.h> #include <asm/xen/hypervisor.h> #include <xen/events.h> #include <xen/page.h> -#include "xenbus_comms.h" +#include "xenbus.h" -static int xenbus_irq; +/* A list of replies. Currently only one will ever be outstanding. */ +LIST_HEAD(xs_reply_list); + +/* A list of write requests. */ +LIST_HEAD(xb_write_list); +DECLARE_WAIT_QUEUE_HEAD(xb_waitq); +DEFINE_MUTEX(xb_write_mutex); -static DECLARE_WORK(probe_work, xenbus_probe); +/* Protect xenbus reader thread against save/restore. */ +DEFINE_MUTEX(xs_response_mutex); -static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); +static int xenbus_irq; +static struct task_struct *xenbus_task; static irqreturn_t wake_waiting(int irq, void *unused) { - if (unlikely(xenstored_ready == 0)) { - xenstored_ready = 1; - schedule_work(&probe_work); - } - wake_up(&xb_waitq); return IRQ_HANDLED; } @@ -84,30 +88,31 @@ static const void *get_input_chunk(XENSTORE_RING_IDX cons, return buf + MASK_XENSTORE_IDX(cons); } +static int xb_data_to_write(void) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + + return (intf->req_prod - intf->req_cons) != XENSTORE_RING_SIZE && + !list_empty(&xb_write_list); +} + /** * xb_write - low level write * @data: buffer to send * @len: length of buffer * - * Returns 0 on success, error otherwise. + * Returns number of bytes written or -err. 
*/ -int xb_write(const void *data, unsigned len) +static int xb_write(const void *data, unsigned int len) { struct xenstore_domain_interface *intf = xen_store_interface; XENSTORE_RING_IDX cons, prod; - int rc; + unsigned int bytes = 0; while (len != 0) { void *dst; unsigned int avail; - rc = wait_event_interruptible( - xb_waitq, - (intf->req_prod - intf->req_cons) != - XENSTORE_RING_SIZE); - if (rc < 0) - return rc; - /* Read indexes, then verify. */ cons = intf->req_cons; prod = intf->req_prod; @@ -115,6 +120,11 @@ int xb_write(const void *data, unsigned len) intf->req_cons = intf->req_prod = 0; return -EIO; } + if (!xb_data_to_write()) + return bytes; + + /* Must write data /after/ reading the consumer index. */ + virt_mb(); dst = get_output_chunk(cons, prod, intf->req, &avail); if (avail == 0) @@ -122,52 +132,45 @@ int xb_write(const void *data, unsigned len) if (avail > len) avail = len; - /* Must write data /after/ reading the consumer index. */ - mb(); - memcpy(dst, data, avail); data += avail; len -= avail; + bytes += avail; /* Other side must not see new producer until data is there. */ - wmb(); + virt_wmb(); intf->req_prod += avail; /* Implies mb(): other side will see the updated producer. */ - notify_remote_via_evtchn(xen_store_evtchn); + if (prod <= intf->req_cons) + notify_remote_via_evtchn(xen_store_evtchn); } - return 0; + return bytes; } -int xb_data_to_read(void) +static int xb_data_to_read(void) { struct xenstore_domain_interface *intf = xen_store_interface; return (intf->rsp_cons != intf->rsp_prod); } -int xb_wait_for_data_to_read(void) -{ - return wait_event_interruptible(xb_waitq, xb_data_to_read()); -} - -int xb_read(void *data, unsigned len) +static int xb_read(void *data, unsigned int len) { struct xenstore_domain_interface *intf = xen_store_interface; XENSTORE_RING_IDX cons, prod; - int rc; + unsigned int bytes = 0; while (len != 0) { unsigned int avail; const char *src; - rc = xb_wait_for_data_to_read(); - if (rc < 0) - return rc; - /* Read indexes, then verify. */ cons = intf->rsp_cons; prod = intf->rsp_prod; + if (cons == prod) + return bytes; + if (!check_indexes(cons, prod)) { intf->rsp_cons = intf->rsp_prod = 0; return -EIO; @@ -180,22 +183,251 @@ int xb_read(void *data, unsigned len) avail = len; /* Must read data /after/ reading the producer index. */ - rmb(); + virt_rmb(); memcpy(data, src, avail); data += avail; len -= avail; + bytes += avail; /* Other side must not see free space until we've copied out */ - mb(); + virt_mb(); intf->rsp_cons += avail; - pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); - /* Implies mb(): other side will see the updated consumer. */ - notify_remote_via_evtchn(xen_store_evtchn); + if (intf->rsp_prod - cons >= XENSTORE_RING_SIZE) + notify_remote_via_evtchn(xen_store_evtchn); + } + + return bytes; +} + +static int process_msg(void) +{ + static struct { + struct xsd_sockmsg msg; + char *body; + union { + void *alloc; + struct xs_watch_event *watch; + }; + bool in_msg; + bool in_hdr; + unsigned int read; + } state; + struct xb_req_data *req; + int err; + unsigned int len; + + if (!state.in_msg) { + state.in_msg = true; + state.in_hdr = true; + state.read = 0; + + /* + * We must disallow save/restore while reading a message. + * A partial read across s/r leaves us out of sync with + * xenstored. + * xs_response_mutex is locked as long as we are processing one + * message. state.in_msg will be true as long as we are holding + * the lock here. 
+ */ + mutex_lock(&xs_response_mutex); + + if (!xb_data_to_read()) { + /* We raced with save/restore: pending data 'gone'. */ + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + return 0; + } } + if (state.in_hdr) { + if (state.read != sizeof(state.msg)) { + err = xb_read((void *)&state.msg + state.read, + sizeof(state.msg) - state.read); + if (err < 0) + goto out; + state.read += err; + if (state.read != sizeof(state.msg)) + return 0; + if (state.msg.len > XENSTORE_PAYLOAD_MAX) { + err = -EINVAL; + goto out; + } + } + + len = state.msg.len + 1; + if (state.msg.type == XS_WATCH_EVENT) + len += sizeof(*state.watch); + + state.alloc = kmalloc(len, GFP_NOIO | __GFP_HIGH); + if (!state.alloc) + return -ENOMEM; + + if (state.msg.type == XS_WATCH_EVENT) + state.body = state.watch->body; + else + state.body = state.alloc; + state.in_hdr = false; + state.read = 0; + } + + err = xb_read(state.body + state.read, state.msg.len - state.read); + if (err < 0) + goto out; + + state.read += err; + if (state.read != state.msg.len) + return 0; + + state.body[state.msg.len] = '\0'; + + if (state.msg.type == XS_WATCH_EVENT) { + state.watch->len = state.msg.len; + err = xs_watch_msg(state.watch); + } else { + err = -ENOENT; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) { + if (req->msg.req_id == state.msg.req_id) { + list_del(&req->list); + err = 0; + break; + } + } + mutex_unlock(&xb_write_mutex); + if (err) + goto out; + + if (req->state == xb_req_state_wait_reply) { + req->msg.req_id = req->caller_req_id; + req->msg.type = state.msg.type; + req->msg.len = state.msg.len; + req->body = state.body; + /* write body, then update state */ + virt_wmb(); + req->state = xb_req_state_got_reply; + req->cb(req); + } + kref_put(&req->kref, xs_free_req); + } + + mutex_unlock(&xs_response_mutex); + + state.in_msg = false; + state.alloc = NULL; + return err; + + out: + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + kfree(state.alloc); + state.alloc = NULL; + return err; +} + +static int process_writes(void) +{ + static struct { + struct xb_req_data *req; + int idx; + unsigned int written; + } state; + void *base; + unsigned int len; + int err = 0; + + if (!xb_data_to_write()) + return 0; + + mutex_lock(&xb_write_mutex); + + if (!state.req) { + state.req = list_first_entry(&xb_write_list, + struct xb_req_data, list); + state.idx = -1; + state.written = 0; + } + + if (state.req->state == xb_req_state_aborted) + goto out_err; + + while (state.idx < state.req->num_vecs) { + if (state.idx < 0) { + base = &state.req->msg; + len = sizeof(state.req->msg); + } else { + base = state.req->vec[state.idx].iov_base; + len = state.req->vec[state.idx].iov_len; + } + err = xb_write(base + state.written, len - state.written); + if (err < 0) + goto out_err; + state.written += err; + if (state.written != len) + goto out; + + state.idx++; + state.written = 0; + } + + list_del(&state.req->list); + state.req->state = xb_req_state_wait_reply; + list_add_tail(&state.req->list, &xs_reply_list); + state.req = NULL; + + out: + mutex_unlock(&xb_write_mutex); + + return 0; + + out_err: + state.req->msg.type = XS_ERROR; + state.req->err = err; + list_del(&state.req->list); + if (state.req->state != xb_req_state_aborted) { + /* write err, then update state */ + virt_wmb(); + state.req->state = xb_req_state_got_reply; + wake_up(&state.req->wq); + } + kref_put(&state.req->kref, xs_free_req); + + mutex_unlock(&xb_write_mutex); + + state.req = NULL; + + return err; +} + +static int xb_thread_work(void) 
+{ + return xb_data_to_read() || xb_data_to_write(); +} + +static int xenbus_thread(void *unused) +{ + int err; + + while (!kthread_should_stop()) { + if (wait_event_interruptible(xb_waitq, xb_thread_work())) + continue; + + err = process_msg(); + if (err == -ENOMEM) + schedule(); + else if (err) + pr_warn_ratelimited("error %d while reading message\n", + err); + + err = process_writes(); + if (err) + pr_warn_ratelimited("error %d while writing message\n", + err); + } + + xenbus_task = NULL; return 0; } @@ -223,6 +455,7 @@ int xb_init_comms(void) rebind_evtchn_irq(xen_store_evtchn, xenbus_irq); } else { int err; + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); if (err < 0) { @@ -231,6 +464,13 @@ int xb_init_comms(void) } xenbus_irq = err; + + if (!xenbus_task) { + xenbus_task = kthread_run(xenbus_thread, NULL, + "xenbus"); + if (IS_ERR(xenbus_task)) + return PTR_ERR(xenbus_task); + } } return 0; diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h deleted file mode 100644 index e74f9c1fbd80..000000000000 --- a/drivers/xen/xenbus/xenbus_comms.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Private include for xenbus communications. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _XENBUS_COMMS_H -#define _XENBUS_COMMS_H - -#include <linux/fs.h> - -int xs_init(void); -int xb_init_comms(void); -void xb_deinit_comms(void); - -/* Low level routines. 
*/ -int xb_write(const void *data, unsigned len); -int xb_read(void *data, unsigned len); -int xb_data_to_read(void); -int xb_wait_for_data_to_read(void); -int xs_input_avail(void); -extern struct xenstore_domain_interface *xen_store_interface; -extern int xen_store_evtchn; -extern enum xenstore_init xen_store_domain_type; - -extern const struct file_operations xen_xenbus_fops; - -#endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index b17707ee07d4..edba5fecde4d 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> @@ -5,7 +6,7 @@ #include <linux/mm.h> #include <linux/fs.h> #include <linux/miscdevice.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/capability.h> #include <xen/xen.h> @@ -16,9 +17,7 @@ #include <xen/events.h> #include <asm/xen/hypervisor.h> -#include "xenbus_comms.h" - -MODULE_LICENSE("GPL"); +#include "xenbus.h" static int xenbus_backend_open(struct inode *inode, struct file *filp) { @@ -49,7 +48,7 @@ static long xenbus_alloc(domid_t domid) goto out_err; gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, - virt_to_mfn(xen_store_interface), 0 /* writable */); + virt_to_gfn(xen_store_interface), 0 /* writable */); arg.dom = DOMID_SELF; arg.remote_dom = domid; @@ -132,11 +131,4 @@ static int __init xenbus_backend_init(void) pr_err("Could not register xenbus backend device\n"); return err; } - -static void __exit xenbus_backend_exit(void) -{ - misc_deregister(&xenbus_backend_dev); -} - -module_init(xenbus_backend_init); -module_exit(xenbus_backend_exit); +device_initcall(xenbus_backend_init); diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 85534ea63555..f5c21ba64df5 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -55,15 +55,15 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/miscdevice.h> -#include <linux/module.h> - -#include "xenbus_comms.h" +#include <linux/workqueue.h> #include <xen/xenbus.h> #include <xen/xen.h> #include <asm/xen/hypervisor.h> -MODULE_LICENSE("GPL"); +#include "xenbus.h" + +unsigned int xb_dev_generation_id; /* * An element of a list of outstanding transactions, for which we're @@ -72,6 +72,7 @@ MODULE_LICENSE("GPL"); struct xenbus_transaction_holder { struct list_head list; struct xenbus_transaction handle; + unsigned int generation_id; }; /* @@ -81,7 +82,7 @@ struct read_buffer { struct list_head list; unsigned int cons; unsigned int len; - char msg[]; + char msg[] __counted_by(len); }; struct xenbus_file_priv { @@ -115,6 +116,9 @@ struct xenbus_file_priv { struct list_head read_buffers; wait_queue_head_t read_waitq; + struct kref kref; + + struct work_struct wq; }; /* Read out any raw xenbus messages queued up. 
*/ @@ -124,7 +128,7 @@ static ssize_t xenbus_file_read(struct file *filp, { struct xenbus_file_priv *u = filp->private_data; struct read_buffer *rb; - unsigned i; + ssize_t i; int ret; mutex_lock(&u->reply_mutex); @@ -144,7 +148,7 @@ again: rb = list_entry(u->read_buffers.next, struct read_buffer, list); i = 0; while (i < len) { - unsigned sz = min((unsigned)len - i, rb->len - rb->cons); + size_t sz = min_t(size_t, len - i, rb->len - rb->cons); ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz); @@ -188,8 +192,10 @@ static int queue_reply(struct list_head *queue, const void *data, size_t len) if (len == 0) return 0; + if (len > XENSTORE_PAYLOAD_MAX) + return -EINVAL; - rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); + rb = kmalloc(struct_size(rb, msg, len), GFP_KERNEL); if (rb == NULL) return -ENOMEM; @@ -258,26 +264,23 @@ out_fail: } static void watch_fired(struct xenbus_watch *watch, - const char **vec, - unsigned int len) + const char *path, + const char *token) { struct watch_adapter *adap; struct xsd_sockmsg hdr; - const char *path, *token; - int path_len, tok_len, body_len, data_len = 0; + const char *token_caller; + int path_len, tok_len, body_len; int ret; LIST_HEAD(staging_q); adap = container_of(watch, struct watch_adapter, watch); - path = vec[XS_WATCH_PATH]; - token = adap->token; + token_caller = adap->token; path_len = strlen(path) + 1; - tok_len = strlen(token) + 1; - if (len > 2) - data_len = vec[len] - vec[2] + 1; - body_len = path_len + tok_len + data_len; + tok_len = strlen(token_caller) + 1; + body_len = path_len + tok_len; hdr.type = XS_WATCH_EVENT; hdr.len = body_len; @@ -288,9 +291,7 @@ static void watch_fired(struct xenbus_watch *watch, if (!ret) ret = queue_reply(&staging_q, path, path_len); if (!ret) - ret = queue_reply(&staging_q, token, tok_len); - if (!ret && len > 2) - ret = queue_reply(&staging_q, vec[2], data_len); + ret = queue_reply(&staging_q, token_caller, tok_len); if (!ret) { /* success: pass reply list onto watcher */ @@ -302,47 +303,100 @@ static void watch_fired(struct xenbus_watch *watch, mutex_unlock(&adap->dev_data->reply_mutex); } -static int xenbus_write_transaction(unsigned msg_type, - struct xenbus_file_priv *u) +static void xenbus_worker(struct work_struct *wq) { - int rc; - void *reply; - struct xenbus_transaction_holder *trans = NULL; - LIST_HEAD(staging_q); + struct xenbus_file_priv *u; + struct xenbus_transaction_holder *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + struct read_buffer *rb, *tmp_rb; - if (msg_type == XS_TRANSACTION_START) { - trans = kmalloc(sizeof(*trans), GFP_KERNEL); - if (!trans) { - rc = -ENOMEM; - goto out; - } - } + u = container_of(wq, struct xenbus_file_priv, wq); - reply = xenbus_dev_request_and_reply(&u->u.msg); - if (IS_ERR(reply)) { + /* + * No need for locking here because there are no other users, + * by definition. 
+ */ + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); kfree(trans); - rc = PTR_ERR(reply); - goto out; } - if (msg_type == XS_TRANSACTION_START) { - trans->handle.id = simple_strtoul(reply, NULL, 0); + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } - list_add(&trans->list, &u->transactions); - } else if (msg_type == XS_TRANSACTION_END) { - list_for_each_entry(trans, &u->transactions, list) - if (trans->handle.id == u->u.msg.tx_id) - break; - BUG_ON(&trans->list == &u->transactions); - list_del(&trans->list); + list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { + list_del(&rb->list); + kfree(rb); + } + kfree(u); +} +static void xenbus_file_free(struct kref *kref) +{ + struct xenbus_file_priv *u; + + /* + * We might be called in xenbus_thread(). + * Use workqueue to avoid deadlock. + */ + u = container_of(kref, struct xenbus_file_priv, kref); + schedule_work(&u->wq); +} + +static struct xenbus_transaction_holder *xenbus_get_transaction( + struct xenbus_file_priv *u, uint32_t tx_id) +{ + struct xenbus_transaction_holder *trans; + + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == tx_id) + return trans; + + return NULL; +} + +void xenbus_dev_queue_reply(struct xb_req_data *req) +{ + struct xenbus_file_priv *u = req->par; + struct xenbus_transaction_holder *trans = NULL; + int rc; + LIST_HEAD(staging_q); + + xs_request_exit(req); + + mutex_lock(&u->msgbuffer_mutex); + + if (req->type == XS_TRANSACTION_START) { + trans = xenbus_get_transaction(u, 0); + if (WARN_ON(!trans)) + goto out; + if (req->msg.type == XS_ERROR) { + list_del(&trans->list); + kfree(trans); + } else { + rc = kstrtou32(req->body, 10, &trans->handle.id); + if (WARN_ON(rc)) + goto out; + } + } else if (req->type == XS_TRANSACTION_END) { + trans = xenbus_get_transaction(u, req->msg.tx_id); + if (WARN_ON(!trans)) + goto out; + list_del(&trans->list); kfree(trans); } + mutex_unlock(&u->msgbuffer_mutex); + mutex_lock(&u->reply_mutex); - rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg)); + rc = queue_reply(&staging_q, &req->msg, sizeof(req->msg)); if (!rc) - rc = queue_reply(&staging_q, reply, u->u.msg.len); + rc = queue_reply(&staging_q, req->body, req->msg.len); if (!rc) { list_splice_tail(&staging_q, &u->read_buffers); wake_up(&u->read_waitq); @@ -351,7 +405,89 @@ static int xenbus_write_transaction(unsigned msg_type, } mutex_unlock(&u->reply_mutex); - kfree(reply); + kfree(req->body); + kref_put(&req->kref, xs_free_req); + + kref_put(&u->kref, xenbus_file_free); + + return; + + out: + mutex_unlock(&u->msgbuffer_mutex); +} + +static int xenbus_command_reply(struct xenbus_file_priv *u, + unsigned int msg_type, const char *reply) +{ + struct { + struct xsd_sockmsg hdr; + char body[16]; + } msg; + int rc; + + msg.hdr = u->u.msg; + msg.hdr.type = msg_type; + msg.hdr.len = strlen(reply) + 1; + if (msg.hdr.len > sizeof(msg.body)) + return -E2BIG; + memcpy(&msg.body, reply, msg.hdr.len); + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&u->read_buffers, &msg, sizeof(msg.hdr) + msg.hdr.len); + wake_up(&u->read_waitq); + mutex_unlock(&u->reply_mutex); + + if (!rc) + kref_put(&u->kref, xenbus_file_free); + + return rc; +} + +static int xenbus_write_transaction(unsigned msg_type, + struct xenbus_file_priv *u) +{ + int rc; + struct xenbus_transaction_holder *trans = NULL; + struct { + 
struct xsd_sockmsg hdr; + char body[]; + } *msg = (void *)u->u.buffer; + + if (msg_type == XS_TRANSACTION_START) { + trans = kzalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) { + rc = -ENOMEM; + goto out; + } + trans->generation_id = xb_dev_generation_id; + list_add(&trans->list, &u->transactions); + } else if (msg->hdr.tx_id != 0 && + !xenbus_get_transaction(u, msg->hdr.tx_id)) + return xenbus_command_reply(u, XS_ERROR, "ENOENT"); + else if (msg_type == XS_TRANSACTION_END && + !(msg->hdr.len == 2 && + (!strcmp(msg->body, "T") || !strcmp(msg->body, "F")))) + return xenbus_command_reply(u, XS_ERROR, "EINVAL"); + else if (msg_type == XS_TRANSACTION_END) { + trans = xenbus_get_transaction(u, msg->hdr.tx_id); + if (trans && trans->generation_id != xb_dev_generation_id) { + list_del(&trans->list); + kfree(trans); + if (!strcmp(msg->body, "T")) + return xenbus_command_reply(u, XS_ERROR, + "EAGAIN"); + else + return xenbus_command_reply(u, + XS_TRANSACTION_END, + "OK"); + } + } + + rc = xenbus_dev_request_and_reply(&msg->hdr, u); + if (rc && trans) { + list_del(&trans->list); + kfree(trans); + } out: return rc; @@ -359,20 +495,19 @@ out: static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) { - struct watch_adapter *watch, *tmp_watch; + struct watch_adapter *watch; char *path, *token; int err, rc; - LIST_HEAD(staging_q); path = u->u.buffer + sizeof(u->u.msg); token = memchr(path, 0, u->u.msg.len); if (token == NULL) { - rc = -EILSEQ; + rc = xenbus_command_reply(u, XS_ERROR, "EINVAL"); goto out; } token++; if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) { - rc = -EILSEQ; + rc = xenbus_command_reply(u, XS_ERROR, "EINVAL"); goto out; } @@ -394,7 +529,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) } list_add(&watch->list, &u->watches); } else { - list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + list_for_each_entry(watch, &u->watches, list) { if (!strcmp(watch->token, token) && !strcmp(watch->watch.node, path)) { unregister_xenbus_watch(&watch->watch); @@ -406,23 +541,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) } /* Success. Synthesize a reply to say all is OK. */ - { - struct { - struct xsd_sockmsg hdr; - char body[3]; - } __packed reply = { - { - .type = msg_type, - .len = sizeof(reply.body) - }, - "OK" - }; - - mutex_lock(&u->reply_mutex); - rc = queue_reply(&u->read_buffers, &reply, sizeof(reply)); - wake_up(&u->read_waitq); - mutex_unlock(&u->reply_mutex); - } + rc = xenbus_command_reply(u, msg_type, "OK"); out: return rc; @@ -436,7 +555,6 @@ static ssize_t xenbus_file_write(struct file *filp, uint32_t msg_type; int rc = len; int ret; - LIST_HEAD(staging_q); /* * We're expecting usermode to be writing properly formed @@ -499,6 +617,8 @@ static ssize_t xenbus_file_write(struct file *filp, * OK, now we have a complete message. Do something with it. 
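[Editor's note, not part of the patch] The write path above expects userspace to hand the character device a struct xsd_sockmsg header followed by exactly msg.len payload bytes; that is what xenbus_write_transaction() and xenbus_write_watch() pick apart from u->u.buffer. The userspace sketch below is only an illustration of that framing: the struct layout and the XS_READ value mirror xen's xs_wire.h, while the device path and the "domid" node are assumptions.

	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <stdio.h>

	struct xsd_sockmsg {
		uint32_t type;
		uint32_t req_id;
		uint32_t tx_id;
		uint32_t len;
	};

	#define XS_READ 2	/* value as in xs_wire.h (assumption) */

	int main(void)
	{
		const char node[] = "domid";	/* payload includes the NUL */
		char buf[sizeof(struct xsd_sockmsg) + sizeof(node)];
		struct xsd_sockmsg hdr = {
			.type = XS_READ,
			.req_id = 1,
			.tx_id = 0,	/* 0 means "no transaction" */
			.len = sizeof(node),
		};
		int fd = open("/dev/xen/xenbus", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memcpy(buf, &hdr, sizeof(hdr));
		memcpy(buf + sizeof(hdr), node, sizeof(node));
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
			perror("write");
		/* The reply (header plus body) is then fetched with read(). */
		close(fd);
		return 0;
	}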
*/ + kref_get(&u->kref); + msg_type = u->u.msg.type; switch (msg_type) { @@ -513,8 +633,10 @@ static ssize_t xenbus_file_write(struct file *filp, ret = xenbus_write_transaction(msg_type, u); break; } - if (ret != 0) + if (ret != 0) { rc = ret; + kref_put(&u->kref, xenbus_file_free); + } /* Buffered message consumed */ u->len = 0; @@ -531,16 +653,19 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) if (xen_store_evtchn == 0) return -ENOENT; - nonseekable_open(inode, filp); + stream_open(inode, filp); u = kzalloc(sizeof(*u), GFP_KERNEL); if (u == NULL) return -ENOMEM; + kref_init(&u->kref); + INIT_LIST_HEAD(&u->transactions); INIT_LIST_HEAD(&u->watches); INIT_LIST_HEAD(&u->read_buffers); init_waitqueue_head(&u->read_waitq); + INIT_WORK(&u->wq, xenbus_worker); mutex_init(&u->reply_mutex); mutex_init(&u->msgbuffer_mutex); @@ -553,43 +678,19 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) static int xenbus_file_release(struct inode *inode, struct file *filp) { struct xenbus_file_priv *u = filp->private_data; - struct xenbus_transaction_holder *trans, *tmp; - struct watch_adapter *watch, *tmp_watch; - struct read_buffer *rb, *tmp_rb; - /* - * No need for locking here because there are no other users, - * by definition. - */ - - list_for_each_entry_safe(trans, tmp, &u->transactions, list) { - xenbus_transaction_end(trans->handle, 1); - list_del(&trans->list); - kfree(trans); - } - - list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { - unregister_xenbus_watch(&watch->watch); - list_del(&watch->list); - free_watch_adapter(watch); - } - - list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { - list_del(&rb->list); - kfree(rb); - } - kfree(u); + kref_put(&u->kref, xenbus_file_free); return 0; } -static unsigned int xenbus_file_poll(struct file *file, poll_table *wait) +static __poll_t xenbus_file_poll(struct file *file, poll_table *wait) { struct xenbus_file_priv *u = file->private_data; poll_wait(file, &u->read_waitq, wait); if (!list_empty(&u->read_buffers)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } @@ -599,7 +700,6 @@ const struct file_operations xen_xenbus_fops = { .open = xenbus_file_open, .release = xenbus_file_release, .poll = xenbus_file_poll, - .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(xen_xenbus_fops); @@ -621,11 +721,4 @@ static int __init xenbus_init(void) pr_err("Could not register xenbus frontend device\n"); return err; } - -static void __exit xenbus_exit(void) -{ - misc_deregister(&xenbus_dev); -} - -module_init(xenbus_init); -module_exit(xenbus_exit); +device_initcall(xenbus_init); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 38e92b770e91..86fe6e779056 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -31,6 +31,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #define DPRINTK(fmt, args...) 
\ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ @@ -51,30 +52,34 @@ #include <linux/module.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <xen/xen.h> #include <xen/xenbus.h> #include <xen/events.h> +#include <xen/xen-ops.h> #include <xen/page.h> #include <xen/hvm.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" +static int xs_init_irq = -1; int xen_store_evtchn; EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +#define XS_INTERFACE_READY \ + ((xen_store_interface != NULL) && \ + (xen_store_interface->connection == XENSTORE_CONNECTED)) + enum xenstore_init xen_store_domain_type; EXPORT_SYMBOL_GPL(xen_store_domain_type); -static unsigned long xen_store_mfn; +static unsigned long xen_store_gfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -89,9 +94,9 @@ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) return NULL; } -int xenbus_match(struct device *_dev, struct device_driver *_drv) +int xenbus_match(struct device *_dev, const struct device_driver *_drv) { - struct xenbus_driver *drv = to_xenbus_driver(_drv); + const struct xenbus_driver *drv = to_xenbus_driver(_drv); if (!drv->ids) return 0; @@ -136,6 +141,7 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus); return xenbus_watch_pathfmt(dev, &dev->otherend_watch, + bus->otherend_will_handle, bus->otherend_changed, "%s/%s", dev->otherend, "state"); } @@ -169,7 +175,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev, EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, + const char *path, const char *token, int ignore_on_shutdown) { struct xenbus_device *dev = @@ -180,18 +186,15 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, /* Protect us against watches firing on old details when the otherend details change, say immediately after a resume. */ if (!dev->otherend || - strncmp(dev->otherend, vec[XS_WATCH_PATH], - strlen(dev->otherend))) { - dev_dbg(&dev->dev, "Ignoring watch at %s\n", - vec[XS_WATCH_PATH]); + strncmp(dev->otherend, path, strlen(dev->otherend))) { + dev_dbg(&dev->dev, "Ignoring watch at %s\n", path); return; } state = xenbus_read_driver_state(dev->otherend); dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n", - state, xenbus_strstate(state), dev->otherend_watch.node, - vec[XS_WATCH_PATH]); + state, xenbus_strstate(state), dev->otherend_watch.node, path); /* * Ignore xenbus transitions during shutdown. 
This prevents us doing @@ -208,6 +211,64 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, } EXPORT_SYMBOL_GPL(xenbus_otherend_changed); +#define XENBUS_SHOW_STAT(name) \ +static ssize_t name##_show(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + \ + return sprintf(buf, "%d\n", atomic_read(&dev->name)); \ +} \ +static DEVICE_ATTR_RO(name) + +XENBUS_SHOW_STAT(event_channels); +XENBUS_SHOW_STAT(events); +XENBUS_SHOW_STAT(spurious_events); +XENBUS_SHOW_STAT(jiffies_eoi_delayed); + +static ssize_t spurious_threshold_show(struct device *_dev, + struct device_attribute *attr, + char *buf) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + + return sprintf(buf, "%d\n", dev->spurious_threshold); +} + +static ssize_t spurious_threshold_store(struct device *_dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + dev->spurious_threshold = val; + + return count; +} + +static DEVICE_ATTR_RW(spurious_threshold); + +static struct attribute *xenbus_attrs[] = { + &dev_attr_event_channels.attr, + &dev_attr_events.attr, + &dev_attr_spurious_events.attr, + &dev_attr_jiffies_eoi_delayed.attr, + &dev_attr_spurious_threshold.attr, + NULL +}; + +static const struct attribute_group xenbus_group = { + .name = "xenbus", + .attrs = xenbus_attrs, +}; + int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); @@ -235,71 +296,87 @@ int xenbus_dev_probe(struct device *_dev) return err; } + if (!try_module_get(drv->driver.owner)) { + dev_warn(&dev->dev, "failed to acquire module reference on '%s'\n", + drv->driver.name); + err = -ESRCH; + goto fail; + } + + down(&dev->reclaim_sem); err = drv->probe(dev, id); + up(&dev->reclaim_sem); if (err) - goto fail; + goto fail_put; err = watch_otherend(dev); if (err) { dev_warn(&dev->dev, "watch_otherend on %s failed.\n", dev->nodename); - return err; + goto fail_remove; } + dev->spurious_threshold = 1; + if (sysfs_create_group(&dev->dev.kobj, &xenbus_group)) + dev_warn(&dev->dev, "sysfs_create_group on %s failed.\n", + dev->nodename); + return 0; +fail_remove: + if (drv->remove) { + down(&dev->reclaim_sem); + drv->remove(dev); + up(&dev->reclaim_sem); + } +fail_put: + module_put(drv->driver.owner); fail: xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); - xenbus_switch_state(dev, XenbusStateClosed); return err; } EXPORT_SYMBOL_GPL(xenbus_dev_probe); -int xenbus_dev_remove(struct device *_dev) +void xenbus_dev_remove(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); DPRINTK("%s", dev->nodename); + sysfs_remove_group(&dev->dev.kobj, &xenbus_group); + free_otherend_watch(dev); - if (drv->remove) + if (drv->remove) { + down(&dev->reclaim_sem); drv->remove(dev); + up(&dev->reclaim_sem); + } + + module_put(drv->driver.owner); free_otherend_details(dev); - xenbus_switch_state(dev, XenbusStateClosed); - return 0; + /* + * If the toolstack has forced the device state to closing then set + * the state to closed now to allow it to be cleaned up. + * Similarly, if the driver does not support re-bind, set the + * closed. 
+ */ + if (!drv->allow_rebind || + xenbus_read_driver_state(dev->nodename) == XenbusStateClosing) + xenbus_switch_state(dev, XenbusStateClosed); } EXPORT_SYMBOL_GPL(xenbus_dev_remove); -void xenbus_dev_shutdown(struct device *_dev) -{ - struct xenbus_device *dev = to_xenbus_device(_dev); - unsigned long timeout = 5*HZ; - - DPRINTK("%s", dev->nodename); - - get_device(&dev->dev); - if (dev->state != XenbusStateConnected) { - pr_info("%s: %s: %s != Connected, skipping\n", - __func__, dev->nodename, xenbus_strstate(dev->state)); - goto out; - } - xenbus_switch_state(dev, XenbusStateClosing); - timeout = wait_for_completion_timeout(&dev->down, timeout); - if (!timeout) - pr_info("%s: %s timeout closing device\n", - __func__, dev->nodename); - out: - put_device(&dev->dev); -} -EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); - int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) + struct xen_bus_type *bus, + struct module *owner, const char *mod_name) { + drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype; drv->driver.bus = &bus->bus; + drv->driver.owner = owner; + drv->driver.mod_name = mod_name; return driver_register(&drv->driver); } @@ -384,12 +461,14 @@ static ssize_t nodename_show(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } +static DEVICE_ATTR_RO(nodename); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(devtype); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -397,14 +476,33 @@ static ssize_t modalias_show(struct device *dev, return sprintf(buf, "%s:%s\n", dev->bus->name, to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(modalias); -struct device_attribute xenbus_dev_attrs[] = { - __ATTR_RO(nodename), - __ATTR_RO(devtype), - __ATTR_RO(modalias), - __ATTR_NULL +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", + xenbus_strstate(to_xenbus_device(dev)->state)); +} +static DEVICE_ATTR_RO(state); + +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + &dev_attr_state.attr, + NULL, +}; + +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -450,11 +548,15 @@ int xenbus_probe_node(struct xen_bus_type *bus, goto fail; dev_set_name(&xendev->dev, "%s", devname); + sema_init(&xendev->reclaim_sem, 1); /* Register with generic device framework. 
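[Editor's note, not part of the patch] The hunks above expose per-device information in sysfs: the nodename/devtype/modalias/state attributes and, from xenbus_dev_probe(), a "xenbus" group with event statistics and the writable spurious_threshold. The sketch below just reads such an attribute from userspace; the sysfs path layout and the example device name "vif-1-0" are assumptions, only the attribute names come from the patch.

	#include <stdio.h>

	static int show_xenbus_attr(const char *dev, const char *attr)
	{
		char path[256], line[64];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/bus/xen/devices/%s/%s", dev, attr);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fgets(line, sizeof(line), f))
			printf("%s: %s", attr, line);
		fclose(f);
		return 0;
	}

	int main(void)
	{
		show_xenbus_attr("vif-1-0", "state");
		show_xenbus_attr("vif-1-0", "xenbus/spurious_events");
		return 0;
	}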
*/ err = device_register(&xendev->dev); - if (err) + if (err) { + put_device(&xendev->dev); + xendev = NULL; goto fail; + } return 0; fail: @@ -581,7 +683,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - pr_warn("suspend %s failed: %i\n", dev_name(dev), err); + dev_warn(dev, "suspend failed: %i\n", err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -600,8 +702,7 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - pr_warn("resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); return err; } @@ -610,15 +711,14 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - pr_warn("resume %s failed: %i\n", dev_name(dev), err); + dev_warn(dev, "resume failed: %i\n", err); return err; } } err = watch_otherend(xdev); if (err) { - pr_warn("resume (watch_otherend) %s failed: %d.\n", - dev_name(dev), err); + dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); return err; } @@ -657,35 +757,132 @@ void unregister_xenstore_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); -void xenbus_probe(struct work_struct *unused) +static void xenbus_probe(void) { xenstored_ready = 1; + if (!xen_store_interface) + xen_store_interface = memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); + /* + * Now it is safe to free the IRQ used for xenstore late + * initialization. No need to unbind: it is about to be + * bound again from xb_init_comms. Note that calling + * unbind_from_irqhandler now would result in xen_evtchn_close() + * being called and the event channel not being enabled again + * afterwards, resulting in missed event notifications. + */ + if (xs_init_irq >= 0) + free_irq(xs_init_irq, &xb_waitq); + + /* + * In the HVM case, xenbus_init() deferred its call to + * xs_init() in case callbacks were not operational yet. + * So do it now. + */ + if (xen_store_domain_type == XS_HVM) + xs_init(); + /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } -EXPORT_SYMBOL_GPL(xenbus_probe); + +/* + * Returns true when XenStore init must be deferred in order to + * allow the PCI platform device to be initialised, before we + * can actually have event channel interrupts working. + */ +static bool xs_hvm_defer_init_for_callback(void) +{ +#ifdef CONFIG_XEN_PVHVM + return xen_store_domain_type == XS_HVM && + !xen_have_vector_callback; +#else + return false; +#endif +} + +static int xenbus_probe_thread(void *unused) +{ + DEFINE_WAIT(w); + + /* + * We actually just want to wait for *any* trigger of xb_waitq, + * and run xenbus_probe() the moment it occurs. + */ + prepare_to_wait(&xb_waitq, &w, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&xb_waitq, &w); + + DPRINTK("probing"); + xenbus_probe(); + return 0; +} static int __init xenbus_probe_initcall(void) { if (!xen_domain()) return -ENODEV; - if (xen_initial_domain() || xen_hvm_domain()) - return 0; + /* + * Probe XenBus here in the XS_PV case, and also XS_HVM unless we + * need to wait for the platform PCI device to come up or + * xen_store_interface is not ready. 
+ */ + if (xen_store_domain_type == XS_PV || + (xen_store_domain_type == XS_HVM && + !xs_hvm_defer_init_for_callback() && + XS_INTERFACE_READY)) + xenbus_probe(); + + /* + * For XS_LOCAL or when xen_store_interface is not ready, spawn a + * thread which will wait for xenstored or a xenstore-stubdom to be + * started, then probe. It will be triggered when communication + * starts happening, by waiting on xb_waitq. + */ + if (xen_store_domain_type == XS_LOCAL || !XS_INTERFACE_READY) { + struct task_struct *probe_task; - xenbus_probe(NULL); + probe_task = kthread_run(xenbus_probe_thread, NULL, + "xenbus_probe"); + if (IS_ERR(probe_task)) + return PTR_ERR(probe_task); + } return 0; } - device_initcall(xenbus_probe_initcall); +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + int ret; + + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + + ret = HYPERVISOR_hvm_op(HVMOP_set_param, &a); + if (ret) + return ret; + + /* + * If xenbus_probe_initcall() deferred the xenbus_probe() + * due to the callback not functioning yet, we can do it now. + */ + if (!xenstored_ready && xs_hvm_defer_init_for_callback()) + xenbus_probe(); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + /* Set up event channel for xenstored which is run as a local process * (this is normally used only in dom0) */ static int __init xenstored_local_init(void) { - int err = 0; + int err = -ENOMEM; unsigned long page = 0; struct evtchn_alloc_unbound alloc_unbound; @@ -694,9 +891,7 @@ static int __init xenstored_local_init(void) if (!page) goto out_err; - xen_store_mfn = xen_start_info->store_mfn = - pfn_to_mfn(virt_to_phys((void *)page) >> - PAGE_SHIFT); + xen_store_gfn = virt_to_gfn((void *)page); /* Next allocate a local port which xenstored can bind to */ alloc_unbound.dom = DOMID_SELF; @@ -708,8 +903,7 @@ static int __init xenstored_local_init(void) goto out_err; BUG_ON(err); - xen_store_evtchn = xen_start_info->store_evtchn = - alloc_unbound.port; + xen_store_evtchn = alloc_unbound.port; return 0; @@ -719,10 +913,49 @@ static int __init xenstored_local_init(void) return err; } -static int __init xenbus_init(void) +static int xenbus_resume_cb(struct notifier_block *nb, + unsigned long action, void *data) { int err = 0; + + if (xen_hvm_domain()) { + uint64_t v = 0; + + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (!err && v) + xen_store_evtchn = v; + else + pr_warn("Cannot update xenstore event channel: %d\n", + err); + } else + xen_store_evtchn = xen_start_info->store_evtchn; + + return err; +} + +static struct notifier_block xenbus_resume_nb = { + .notifier_call = xenbus_resume_cb, +}; + +static irqreturn_t xenbus_late_init(int irq, void *unused) +{ + int err; + uint64_t v = 0; + + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err || !v || !~v) + return IRQ_HANDLED; + xen_store_gfn = (unsigned long)v; + + wake_up(&xb_waitq); + return IRQ_HANDLED; +} + +static int __init xenbus_init(void) +{ + int err; uint64_t v = 0; + bool wait = false; xen_store_domain_type = XS_UNKNOWN; if (!xen_domain()) @@ -733,9 +966,15 @@ static int __init xenbus_init(void) if (xen_pv_domain()) xen_store_domain_type = XS_PV; if (xen_hvm_domain()) + { xen_store_domain_type = XS_HVM; - if (xen_hvm_domain() && xen_initial_domain()) - xen_store_domain_type = XS_LOCAL; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + if (!v && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + } if (xen_pv_domain() && 
!xen_start_info->store_evtchn) xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && xen_start_info->store_evtchn) @@ -746,46 +985,103 @@ static int __init xenbus_init(void) err = xenstored_local_init(); if (err) goto out_error; - xen_store_interface = mfn_to_virt(xen_store_mfn); + xen_store_interface = gfn_to_virt(xen_store_gfn); break; case XS_PV: xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; - xen_store_interface = mfn_to_virt(xen_store_mfn); + xen_store_gfn = xen_start_info->store_mfn; + xen_store_interface = gfn_to_virt(xen_store_gfn); break; case XS_HVM: - err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); - if (err) - goto out_error; - xen_store_evtchn = (int)v; err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); if (err) goto out_error; - xen_store_mfn = (unsigned long)v; - xen_store_interface = - xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + /* + * Uninitialized hvm_params are zero and return no error. + * Although it is theoretically possible to have + * HVM_PARAM_STORE_PFN set to zero on purpose, in reality it is + * not zero when valid. If zero, it means that Xenstore hasn't + * been properly initialized. Instead of attempting to map a + * wrong guest physical address return error. + * + * Also recognize all bits set as an invalid/uninitialized value. + */ + if (!v) { + err = -ENOENT; + goto out_error; + } + if (v == ~0ULL) { + wait = true; + } else { + /* Avoid truncation on 32-bit. */ +#if BITS_PER_LONG == 32 + if (v > ULONG_MAX) { + pr_err("%s: cannot handle HVM_PARAM_STORE_PFN=%llx > ULONG_MAX\n", + __func__, v); + err = -EINVAL; + goto out_error; + } +#endif + xen_store_gfn = (unsigned long)v; + xen_store_interface = + memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); + if (!xen_store_interface) { + pr_err("%s: cannot map HVM_PARAM_STORE_PFN=%llx\n", + __func__, v); + err = -EINVAL; + goto out_error; + } + if (xen_store_interface->connection != XENSTORE_CONNECTED) + wait = true; + } + if (wait) { + err = bind_evtchn_to_irqhandler(xen_store_evtchn, + xenbus_late_init, + 0, "xenstore_late_init", + &xb_waitq); + if (err < 0) { + pr_err("xenstore_late_init couldn't bind irq err=%d\n", + err); + goto out_error; + } + + xs_init_irq = err; + } break; default: pr_warn("Xenstore state unknown\n"); break; } - /* Initialize the interface to xenstore. */ - err = xs_init(); - if (err) { - pr_warn("Error initializing xenstore comms: %i\n", err); - goto out_error; + /* + * HVM domains may not have a functional callback yet. In that + * case let xs_init() be called from xenbus_probe(), which will + * get invoked at an appropriate time. + */ + if (xen_store_domain_type != XS_HVM) { + err = xs_init(); + if (err) { + pr_warn("Error initializing xenstore comms: %i\n", err); + goto out_error; + } } + if ((xen_store_domain_type != XS_LOCAL) && + (xen_store_domain_type != XS_UNKNOWN)) + xen_resume_notifier_register(&xenbus_resume_nb); + #ifdef CONFIG_XEN_COMPAT_XENFS /* * Create xenfs mountpoint in /proc for compatibility with * utilities that expect to find "xenbus" under "/proc/xen". 
*/ - proc_mkdir("xen", NULL); + proc_create_mount_point("xen"); #endif + return 0; out_error: + xen_store_domain_type = XS_UNKNOWN; return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h deleted file mode 100644 index 146f857a36f8..000000000000 --- a/drivers/xen/xenbus/xenbus_probe.h +++ /dev/null @@ -1,86 +0,0 @@ -/****************************************************************************** - * xenbus_probe.h - * - * Talks to Xen Store to figure out what devices we have. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 XenSource Ltd. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef _XENBUS_PROBE_H -#define _XENBUS_PROBE_H - -#define XEN_BUS_ID_SIZE 20 - -struct xen_bus_type { - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); - int (*probe)(struct xen_bus_type *bus, const char *type, - const char *dir); - void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, - unsigned int len); - struct bus_type bus; -}; - -enum xenstore_init { - XS_UNKNOWN, - XS_PV, - XS_HVM, - XS_LOCAL, -}; - -extern struct device_attribute xenbus_dev_attrs[]; - -extern int xenbus_match(struct device *_dev, struct device_driver *_drv); -extern int xenbus_dev_probe(struct device *_dev); -extern int xenbus_dev_remove(struct device *_dev); -extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus); -extern int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename); -extern int xenbus_probe_devices(struct xen_bus_type *bus); - -extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); - -extern void xenbus_dev_shutdown(struct device *_dev); - -extern int xenbus_dev_suspend(struct device *dev); -extern int xenbus_dev_resume(struct device *dev); -extern int xenbus_dev_cancel(struct device *dev); - -extern void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, - int ignore_on_shutdown); - -extern int xenbus_read_otherend_details(struct xenbus_device *xendev, - char *id_node, char *path_node); - -void xenbus_ring_ops_init(void); - -#endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 998bbbab816b..5ebb7233076f 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -45,16 +45,15 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/export.h> +#include <linux/semaphore.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <asm/hypervisor.h> #include <xen/xenbus.h> #include <xen/features.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" /* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) @@ -93,12 +92,12 @@ static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) return 0; } -static int xenbus_uevent_backend(struct device *dev, +static int xenbus_uevent_backend(const struct device *dev, struct kobj_uevent_env *env) { - struct xenbus_device *xdev; - struct xenbus_driver *drv; - struct xen_bus_type *bus; + const struct xenbus_device *xdev; + const struct xenbus_driver *drv; + const struct xen_bus_type *bus; DPRINTK(""); @@ -181,10 +180,16 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, return err; } +static bool frontend_will_handle(struct xenbus_watch *watch, + const char *path, const char *token) +{ + return watch->nr_pending == 0; +} + static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 0); + xenbus_otherend_changed(watch, path, token, 0); } static struct xen_bus_type xenbus_backend = { @@ -192,6 +197,7 @@ static struct xen_bus_type xenbus_backend = { .levels = 3, /* backend/type/<frontend>/<id> */ .get_bus_id = backend_bus_id, .probe = xenbus_probe_backend, + .otherend_will_handle = frontend_will_handle, .otherend_changed = 
frontend_changed, .bus = { .name = "xen-backend", @@ -199,17 +205,16 @@ static struct xen_bus_type xenbus_backend = { .uevent = xenbus_uevent_backend, .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); + xenbus_dev_changed(path, &xenbus_backend); } static struct xenbus_watch be_watch = { @@ -224,23 +229,19 @@ static int read_frontend_details(struct xenbus_device *xendev) int xenbus_dev_is_online(struct xenbus_device *dev) { - int rc, val; - - rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); - if (rc != 1) - val = 0; /* no online node present */ - - return val; + return !!xenbus_read_unsigned(dev->nodename, "online", 0); } EXPORT_SYMBOL_GPL(xenbus_dev_is_online); -int xenbus_register_backend(struct xenbus_driver *drv) +int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { drv->read_otherend_details = read_frontend_details; - return xenbus_register_driver_common(drv, &xenbus_backend); + return xenbus_register_driver_common(drv, &xenbus_backend, + owner, mod_name); } -EXPORT_SYMBOL_GPL(xenbus_register_backend); +EXPORT_SYMBOL_GPL(__xenbus_register_backend); static int backend_probe_and_watch(struct notifier_block *notifier, unsigned long event, @@ -253,8 +254,39 @@ static int backend_probe_and_watch(struct notifier_block *notifier, return NOTIFY_DONE; } +static int backend_reclaim_memory(struct device *dev, void *data) +{ + const struct xenbus_driver *drv; + struct xenbus_device *xdev; + + if (!dev->driver) + return 0; + drv = to_xenbus_driver(dev->driver); + if (drv && drv->reclaim_memory) { + xdev = to_xenbus_device(dev); + if (down_trylock(&xdev->reclaim_sem)) + return 0; + drv->reclaim_memory(xdev); + up(&xdev->reclaim_sem); + } + return 0; +} + +/* + * Returns 0 always because we are using shrinker to only detect memory + * pressure. + */ +static unsigned long backend_shrink_memory_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, + backend_reclaim_memory); + return 0; +} + static int __init xenbus_probe_backend_init(void) { + struct shrinker *backend_memory_shrinker; static struct notifier_block xenstore_notifier = { .notifier_call = backend_probe_and_watch }; @@ -269,6 +301,16 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); + backend_memory_shrinker = shrinker_alloc(0, "xen-backend"); + if (!backend_memory_shrinker) { + pr_warn("shrinker allocation failed\n"); + return 0; + } + + backend_memory_shrinker->count_objects = backend_shrink_memory_count; + + shrinker_register(backend_memory_shrinker); + return 0; } subsys_initcall(xenbus_probe_backend_init); diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 6ed8a9df4472..6d1819269cbe 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DPRINTK(fmt, ...) 
\ @@ -18,7 +19,6 @@ #include <linux/module.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <xen/xenbus.h> #include <xen/events.h> @@ -27,11 +27,9 @@ #include <xen/platform_pci.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" -static struct workqueue_struct *xenbus_frontend_wq; /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) @@ -42,7 +40,7 @@ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) return -EINVAL; } - strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); + strscpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; @@ -75,10 +73,10 @@ static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, return err; } -static int xenbus_uevent_frontend(struct device *_dev, +static int xenbus_uevent_frontend(const struct device *_dev, struct kobj_uevent_env *env) { - struct xenbus_device *dev = to_xenbus_device(_dev); + const struct xenbus_device *dev = to_xenbus_device(_dev); if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) return -ENOMEM; @@ -88,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev, static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 1); + xenbus_otherend_changed(watch, path, token, 1); } static void xenbus_frontend_delayed_resume(struct work_struct *w) @@ -109,14 +107,7 @@ static int xenbus_frontend_dev_resume(struct device *dev) if (xen_store_domain_type == XS_LOCAL) { struct xenbus_device *xdev = to_xenbus_device(dev); - if (!xenbus_frontend_wq) { - pr_err("%s: no workqueue to process delayed resume\n", - xdev->nodename); - return -EFAULT; - } - - INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); - queue_work(xenbus_frontend_wq, &xdev->work); + schedule_work(&xdev->work); return 0; } @@ -124,6 +115,38 @@ static int xenbus_frontend_dev_resume(struct device *dev) return xenbus_dev_resume(dev); } +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} + +static void xenbus_frontend_dev_shutdown(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; + + DPRINTK("%s", dev->nodename); + + get_device(&dev->dev); + if (dev->state != XenbusStateConnected) { + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); + goto out; + } + xenbus_switch_state(dev, XenbusStateClosing); + timeout = wait_for_completion_timeout(&dev->down, timeout); + if (!timeout) + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); + out: + put_device(&dev->dev); +} + static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, .resume = xenbus_frontend_dev_resume, @@ -142,21 +165,21 @@ static struct xen_bus_type xenbus_frontend = { .name = "xen", .match = xenbus_match, .uevent = xenbus_uevent_frontend, - .probe = xenbus_dev_probe, + .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .shutdown = xenbus_frontend_dev_shutdown, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, 
}; static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); + xenbus_dev_changed(path, &xenbus_frontend); } @@ -188,19 +211,11 @@ static int is_device_connecting(struct device *dev, void *data, bool ignore_none if (drv && (dev->driver != drv)) return 0; - if (ignore_nonessential) { - /* With older QEMU, for PVonHVM guests the guest config files - * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] - * which is nonsensical as there is no PV FB (there can be - * a PVKB) running as HVM guest. */ + xendrv = to_xenbus_driver(dev->driver); - if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) - return 0; + if (ignore_nonessential && xendrv->not_essential) + return 0; - if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) - return 0; - } - xendrv = to_xenbus_driver(dev->driver); return (xendev->state < XenbusStateConnected || (xendev->state == XenbusStateConnected && xendrv->is_ready && !xendrv->is_ready(xendev))); @@ -308,13 +323,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv) print_device_status); } -int xenbus_register_frontend(struct xenbus_driver *drv) +int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { int ret; drv->read_otherend_details = read_backend_details; - ret = xenbus_register_driver_common(drv, &xenbus_frontend); + ret = xenbus_register_driver_common(drv, &xenbus_frontend, + owner, mod_name); if (ret) return ret; @@ -323,17 +340,19 @@ int xenbus_register_frontend(struct xenbus_driver *drv) return 0; } -EXPORT_SYMBOL_GPL(xenbus_register_frontend); +EXPORT_SYMBOL_GPL(__xenbus_register_frontend); static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); static int backend_state; static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, - const char **v, unsigned int l) + const char *path, const char *token) { - xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state); + if (xenbus_scanf(XBT_NIL, path, "", "%i", + &backend_state) != 1) + backend_state = XenbusStateUnknown; printk(KERN_DEBUG "XENBUS: backend %s %s\n", - v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + path, xenbus_strstate(backend_state)); wake_up(&backend_state_wq); } @@ -374,10 +393,12 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) case XenbusStateConnected: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); xenbus_reset_wait_for_backend(be, XenbusStateClosing); + fallthrough; case XenbusStateClosing: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); xenbus_reset_wait_for_backend(be, XenbusStateClosed); + fallthrough; case XenbusStateClosed: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); @@ -408,7 +429,7 @@ static void xenbus_check_frontend(char *class, char *dev) printk(KERN_DEBUG "XENBUS: frontend %s %s\n", frontend, xenbus_strstate(fe_state)); backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); - if (!backend || IS_ERR(backend)) + if (IS_ERR_OR_NULL(backend)) goto out; err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); if (err == 1) @@ -474,8 +495,6 @@ static int __init xenbus_probe_frontend_init(void) register_xenstore_notifier(&xenstore_notifier); - xenbus_frontend_wq = create_workqueue("xenbus_frontend"); - return 0; } subsys_initcall(xenbus_probe_frontend_init); @@ -483,7 +502,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init 
boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; @@ -494,4 +513,5 @@ static int __init boot_wait_for_devices(void) late_initcall(boot_wait_for_devices); #endif +MODULE_DESCRIPTION("Xen PV-device frontend support"); MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index b6d5fff43d16..15f18374020e 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -43,69 +43,36 @@ #include <linux/slab.h> #include <linux/fcntl.h> #include <linux/kthread.h> +#include <linux/reboot.h> #include <linux/rwsem.h> -#include <linux/module.h> #include <linux/mutex.h> #include <asm/xen/hypervisor.h> #include <xen/xenbus.h> #include <xen/xen.h> -#include "xenbus_comms.h" - -struct xs_stored_msg { - struct list_head list; - - struct xsd_sockmsg hdr; - - union { - /* Queued replies. */ - struct { - char *body; - } reply; - - /* Queued watch events. */ - struct { - struct xenbus_watch *handle; - char **vec; - unsigned int vec_size; - } watch; - } u; -}; - -struct xs_handle { - /* A list of replies. Currently only one will ever be outstanding. */ - struct list_head reply_list; - spinlock_t reply_lock; - wait_queue_head_t reply_waitq; - - /* - * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. - * response_mutex is never taken simultaneously with the other three. - * - * transaction_mutex must be held before incrementing - * transaction_count. The mutex is held when a suspend is in - * progress to prevent new transactions starting. - * - * When decrementing transaction_count to zero the wait queue - * should be woken up, the suspend code waits for count to - * reach zero. - */ - - /* One request at a time. */ - struct mutex request_mutex; +#include "xenbus.h" - /* Protect xenbus reader thread against save/restore. */ - struct mutex response_mutex; - - /* Protect transactions against save/restore. */ - struct mutex transaction_mutex; - atomic_t transaction_count; - wait_queue_head_t transaction_wq; +/* + * Framework to protect suspend/resume handling against normal Xenstore + * message handling: + * During suspend/resume there must be no open transaction and no pending + * Xenstore request. + * New watch events happening in this time can be ignored by firing all watches + * after resume. + */ - /* Protect watch (de)register against save/restore. */ - struct rw_semaphore watch_mutex; -}; +/* Lock protecting enter/exit critical region. */ +static DEFINE_SPINLOCK(xs_state_lock); +/* Number of users in critical region (protected by xs_state_lock). */ +static unsigned int xs_state_users; +/* Suspend handler waiting or already active (protected by xs_state_lock)? */ +static int xs_suspend_active; +/* Unique Xenstore request id (protected by xs_state_lock). */ +static uint32_t xs_request_id; -static struct xs_handle xs_state; +/* Wait queue for all callers waiting for critical region to become usable. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_enter_wq); +/* Wait queue for suspend handling waiting for critical region being empty. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_exit_wq); /* List of registered watches, and a lock to protect it. */ static LIST_HEAD(watches); @@ -115,6 +82,9 @@ static DEFINE_SPINLOCK(watches_lock); static LIST_HEAD(watch_events); static DEFINE_SPINLOCK(watch_events_lock); +/* Protect watch (de)register against save/restore. 
*/ +static DECLARE_RWSEM(xs_watch_rwsem); + /* * Details of the xenwatch callback kernel thread. The thread waits on the * watch_events_waitq for work to do (queued on watch_events list). When it @@ -125,6 +95,68 @@ static pid_t xenwatch_pid; static DEFINE_MUTEX(xenwatch_mutex); static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); +static void xs_suspend_enter(void) +{ + spin_lock(&xs_state_lock); + xs_suspend_active++; + spin_unlock(&xs_state_lock); + wait_event(xs_state_exit_wq, xs_state_users == 0); +} + +static void xs_suspend_exit(void) +{ + xb_dev_generation_id++; + spin_lock(&xs_state_lock); + xs_suspend_active--; + spin_unlock(&xs_state_lock); + wake_up_all(&xs_state_enter_wq); +} + +void xs_free_req(struct kref *kref) +{ + struct xb_req_data *req = container_of(kref, struct xb_req_data, kref); + kfree(req); +} + +static uint32_t xs_request_enter(struct xb_req_data *req) +{ + uint32_t rq_id; + + req->type = req->msg.type; + + spin_lock(&xs_state_lock); + + while (!xs_state_users && xs_suspend_active) { + spin_unlock(&xs_state_lock); + wait_event(xs_state_enter_wq, xs_suspend_active == 0); + spin_lock(&xs_state_lock); + } + + if (req->type == XS_TRANSACTION_START && !req->user_req) + xs_state_users++; + xs_state_users++; + rq_id = xs_request_id++; + + spin_unlock(&xs_state_lock); + + return rq_id; +} + +void xs_request_exit(struct xb_req_data *req) +{ + spin_lock(&xs_state_lock); + xs_state_users--; + if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) || + (req->type == XS_TRANSACTION_END && !req->user_req && + !WARN_ON_ONCE(req->msg.type == XS_ERROR && + !strcmp(req->body, "ENOENT")))) + xs_state_users--; + spin_unlock(&xs_state_lock); + + if (xs_suspend_active && !xs_state_users) + wake_up(&xs_state_exit_wq); +} + static int get_error(const char *errorstring) { unsigned int i; @@ -139,88 +171,143 @@ static int get_error(const char *errorstring) return xsd_errors[i].errnum; } -static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) +static bool xenbus_ok(void) { - struct xs_stored_msg *msg; - char *body; - - spin_lock(&xs_state.reply_lock); + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} - while (list_empty(&xs_state.reply_list)) { - spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); - spin_lock(&xs_state.reply_lock); +static bool test_reply(struct xb_req_data *req) +{ + if (req->state == xb_req_state_got_reply || !xenbus_ok()) { + /* read req->state before all other fields */ + virt_rmb(); + return true; } - msg = list_entry(xs_state.reply_list.next, - struct xs_stored_msg, list); - list_del(&msg->list); + /* Make sure to reread req->state each time. 
*/ + barrier(); - spin_unlock(&xs_state.reply_lock); + return false; +} - *type = msg->hdr.type; - if (len) - *len = msg->hdr.len; - body = msg->u.reply.body; +static void *read_reply(struct xb_req_data *req) +{ + do { + wait_event(req->wq, test_reply(req)); - kfree(msg); + if (!xenbus_ok()) + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + if (req->err) + return ERR_PTR(req->err); - return body; -} + } while (req->state != xb_req_state_got_reply); -static void transaction_start(void) -{ - mutex_lock(&xs_state.transaction_mutex); - atomic_inc(&xs_state.transaction_count); - mutex_unlock(&xs_state.transaction_mutex); + return req->body; } -static void transaction_end(void) +static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) { - if (atomic_dec_and_test(&xs_state.transaction_count)) - wake_up(&xs_state.transaction_wq); + bool notify; + + req->msg = *msg; + req->err = 0; + req->state = xb_req_state_queued; + init_waitqueue_head(&req->wq); + + /* Save the caller req_id and restore it later in the reply */ + req->caller_req_id = req->msg.req_id; + req->msg.req_id = xs_request_enter(req); + + /* + * Take 2nd ref. One for this thread, and the second for the + * xenbus_thread. + */ + kref_get(&req->kref); + + mutex_lock(&xb_write_mutex); + list_add_tail(&req->list, &xb_write_list); + notify = list_is_singular(&xb_write_list); + mutex_unlock(&xb_write_mutex); + + if (notify) + wake_up(&xb_waitq); } -static void transaction_suspend(void) +static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg) { - mutex_lock(&xs_state.transaction_mutex); - wait_event(xs_state.transaction_wq, - atomic_read(&xs_state.transaction_count) == 0); + void *ret; + + ret = read_reply(req); + + xs_request_exit(req); + + msg->type = req->msg.type; + msg->len = req->msg.len; + + mutex_lock(&xb_write_mutex); + if (req->state == xb_req_state_queued || + req->state == xb_req_state_wait_reply) + req->state = xb_req_state_aborted; + + kref_put(&req->kref, xs_free_req); + mutex_unlock(&xb_write_mutex); + + return ret; } -static void transaction_resume(void) +static void xs_wake_up(struct xb_req_data *req) { - mutex_unlock(&xs_state.transaction_mutex); + wake_up(&req->wq); } -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par) { - void *ret; - struct xsd_sockmsg req_msg = *msg; - int err; + struct xb_req_data *req; + struct kvec *vec; - if (req_msg.type == XS_TRANSACTION_START) - transaction_start(); + req = kmalloc(sizeof(*req) + sizeof(*vec), GFP_KERNEL); + if (!req) + return -ENOMEM; - mutex_lock(&xs_state.request_mutex); + vec = (struct kvec *)(req + 1); + vec->iov_len = msg->len; + vec->iov_base = msg + 1; - err = xb_write(msg, sizeof(*msg) + msg->len); - if (err) { - msg->type = XS_ERROR; - ret = ERR_PTR(err); - } else - ret = read_reply(&msg->type, &msg->len); - - mutex_unlock(&xs_state.request_mutex); + req->vec = vec; + req->num_vecs = 1; + req->cb = xenbus_dev_queue_reply; + req->par = par; + req->user_req = true; + kref_init(&req->kref); - if ((msg->type == XS_TRANSACTION_END) || - ((req_msg.type == XS_TRANSACTION_START) && - (msg->type == XS_ERROR))) - transaction_end(); + xs_send(req, msg); - return ret; + return 0; } EXPORT_SYMBOL(xenbus_dev_request_and_reply); @@ -231,37 +318,34 @@ static void *xs_talkv(struct 
xenbus_transaction t, unsigned int num_vecs, unsigned int *len) { + struct xb_req_data *req; struct xsd_sockmsg msg; void *ret = NULL; unsigned int i; int err; - msg.tx_id = t.id; + req = kmalloc(sizeof(*req), GFP_NOIO | __GFP_HIGH); + if (!req) + return ERR_PTR(-ENOMEM); + + req->vec = iovec; + req->num_vecs = num_vecs; + req->cb = xs_wake_up; + req->user_req = false; + kref_init(&req->kref); + msg.req_id = 0; + msg.tx_id = t.id; msg.type = type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; - mutex_lock(&xs_state.request_mutex); - - err = xb_write(&msg, sizeof(msg)); - if (err) { - mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - - for (i = 0; i < num_vecs; i++) { - err = xb_write(iovec[i].iov_base, iovec[i].iov_len); - if (err) { - mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - } - - ret = read_reply(&msg.type, len); + xs_send(req, &msg); - mutex_unlock(&xs_state.request_mutex); + ret = xs_wait_for_reply(req, &msg); + if (len) + *len = msg.len; if (IS_ERR(ret)) return ret; @@ -323,10 +407,10 @@ static char *join(const char *dir, const char *name) buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir); else buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name); - return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; + return buffer ?: ERR_PTR(-ENOMEM); } -static char **split(char *strings, unsigned int len, unsigned int *num) +static char **split_strings(char *strings, unsigned int len, unsigned int *num) { char *p, **ret; @@ -357,14 +441,14 @@ char **xenbus_directory(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (char **)path; + return ERR_CAST(path); strings = xs_single(t, XS_DIRECTORY, path, &len); kfree(path); if (IS_ERR(strings)) - return (char **)strings; + return ERR_CAST(strings); - return split(strings, len, num); + return split_strings(strings, len, num); } EXPORT_SYMBOL_GPL(xenbus_directory); @@ -395,7 +479,7 @@ void *xenbus_read(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (void *)path; + return ERR_CAST(path); ret = xs_single(t, XS_READ, path, len); kfree(path); @@ -428,23 +512,6 @@ int xenbus_write(struct xenbus_transaction t, } EXPORT_SYMBOL_GPL(xenbus_write); -/* Create a new directory. */ -int xenbus_mkdir(struct xenbus_transaction t, - const char *dir, const char *node) -{ - char *path; - int ret; - - path = join(dir, node); - if (IS_ERR(path)) - return PTR_ERR(path); - - ret = xs_error(xs_single(t, XS_MKDIR, path, NULL)); - kfree(path); - return ret; -} -EXPORT_SYMBOL_GPL(xenbus_mkdir); - /* Destroy a file or directory (directories must be empty). */ int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) { @@ -468,13 +535,9 @@ int xenbus_transaction_start(struct xenbus_transaction *t) { char *id_str; - transaction_start(); - id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); - if (IS_ERR(id_str)) { - transaction_end(); + if (IS_ERR(id_str)) return PTR_ERR(id_str); - } t->id = simple_strtoul(id_str, NULL, 0); kfree(id_str); @@ -483,23 +546,12 @@ int xenbus_transaction_start(struct xenbus_transaction *t) EXPORT_SYMBOL_GPL(xenbus_transaction_start); /* End a transaction. - * If abandon is true, transaction is discarded instead of committed. + * If abort is true, transaction is discarded instead of committed. 
*/ -int xenbus_transaction_end(struct xenbus_transaction t, int abort) +int xenbus_transaction_end(struct xenbus_transaction t, bool abort) { - char abortstr[2]; - int err; - - if (abort) - strcpy(abortstr, "F"); - else - strcpy(abortstr, "T"); - - err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); - - transaction_end(); - - return err; + return xs_error(xs_single(t, XS_TRANSACTION_END, abort ? "F" : "T", + NULL)); } EXPORT_SYMBOL_GPL(xenbus_transaction_end); @@ -526,6 +578,21 @@ int xenbus_scanf(struct xenbus_transaction t, } EXPORT_SYMBOL_GPL(xenbus_scanf); +/* Read an (optional) unsigned value. */ +unsigned int xenbus_read_unsigned(const char *dir, const char *node, + unsigned int default_val) +{ + unsigned int val; + int ret; + + ret = xenbus_scanf(XBT_NIL, dir, node, "%u", &val); + if (ret <= 0) + val = default_val; + + return val; +} +EXPORT_SYMBOL_GPL(xenbus_read_unsigned); + /* Single printf and write: returns -errno or 0. */ int xenbus_printf(struct xenbus_transaction t, const char *dir, const char *node, const char *fmt, ...) @@ -617,39 +684,43 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } -/* - * Certain older XenBus toolstack cannot handle reading values that are - * not populated. Some Xen 3.4 installation are incapable of doing this - * so if we are running on anything older than 4 do not attempt to read - * control/platform-feature-xs_reset_watches. - */ -static bool xen_strict_xenbus_quirk(void) -{ -#ifdef CONFIG_X86 - uint32_t eax, ebx, ecx, edx, base; - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); +int xs_watch_msg(struct xs_watch_event *event) +{ + if (count_strings(event->body, event->len) != 2) { + kfree(event); + return -EINVAL; + } + event->path = (const char *)event->body; + event->token = (const char *)strchr(event->body, '\0') + 1; - if ((eax >> 16) < 4) - return true; -#endif - return false; + spin_lock(&watches_lock); + event->handle = find_watch(event->token); + if (event->handle != NULL && + (!event->handle->will_handle || + event->handle->will_handle(event->handle, + event->path, event->token))) { + spin_lock(&watch_events_lock); + list_add_tail(&event->list, &watch_events); + event->handle->nr_pending++; + wake_up(&watch_events_waitq); + spin_unlock(&watch_events_lock); + } else + kfree(event); + spin_unlock(&watches_lock); + return 0; } + static void xs_reset_watches(void) { - int err, supported = 0; + int err; if (!xen_hvm_domain() || xen_initial_domain()) return; - if (xen_strict_xenbus_quirk()) - return; - - err = xenbus_scanf(XBT_NIL, "control", - "platform-feature-xs_reset_watches", "%d", &supported); - if (err != 1 || !supported) + if (!xenbus_read_unsigned("control", + "platform-feature-xs_reset_watches", 0)) return; err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); @@ -666,7 +737,9 @@ int register_xenbus_watch(struct xenbus_watch *watch) sprintf(token, "%lX", (long)watch); - down_read(&xs_state.watch_mutex); + watch->nr_pending = 0; + + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(find_watch(token)); @@ -681,7 +754,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) spin_unlock(&watches_lock); } - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); return err; } @@ -689,13 +762,13 @@ EXPORT_SYMBOL_GPL(register_xenbus_watch); void unregister_xenbus_watch(struct xenbus_watch *watch) { - struct xs_stored_msg *msg, *tmp; + struct xs_watch_event *event, *tmp; char token[sizeof(watch) * 2 + 1]; int err; sprintf(token, "%lX", (long)watch); - 
down_read(&xs_state.watch_mutex); + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(!find_watch(token)); @@ -706,7 +779,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) if (err) pr_warn("Failed to release watch %s: %i\n", watch->node, err); - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); /* Make sure there are no callbacks running currently (unless its us) */ @@ -715,12 +788,14 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) /* Cancel pending watch events. */ spin_lock(&watch_events_lock); - list_for_each_entry_safe(msg, tmp, &watch_events, list) { - if (msg->u.watch.handle != watch) - continue; - list_del(&msg->list); - kfree(msg->u.watch.vec); - kfree(msg); + if (watch->nr_pending) { + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) + continue; + list_del(&event->list); + kfree(event); + } + watch->nr_pending = 0; } spin_unlock(&watch_events_lock); @@ -731,10 +806,10 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); void xs_suspend(void) { - transaction_suspend(); - down_write(&xs_state.watch_mutex); - mutex_lock(&xs_state.request_mutex); - mutex_lock(&xs_state.response_mutex); + xs_suspend_enter(); + + mutex_lock(&xs_response_mutex); + down_write(&xs_watch_rwsem); } void xs_resume(void) @@ -744,31 +819,32 @@ void xs_resume(void) xb_init_comms(); - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - transaction_resume(); + mutex_unlock(&xs_response_mutex); + + xs_suspend_exit(); - /* No need for watches_lock: the watch_mutex is sufficient. */ + /* No need for watches_lock: the xs_watch_rwsem is sufficient. */ list_for_each_entry(watch, &watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } - up_write(&xs_state.watch_mutex); + up_write(&xs_watch_rwsem); } void xs_suspend_cancel(void) { - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - up_write(&xs_state.watch_mutex); - mutex_unlock(&xs_state.transaction_mutex); + up_write(&xs_watch_rwsem); + mutex_unlock(&xs_response_mutex); + + xs_suspend_exit(); } static int xenwatch_thread(void *unused) { - struct list_head *ent; - struct xs_stored_msg *msg; + struct xs_watch_event *event; + + xenwatch_pid = current->pid; for (;;) { wait_event_interruptible(watch_events_waitq, @@ -780,19 +856,18 @@ static int xenwatch_thread(void *unused) mutex_lock(&xenwatch_mutex); spin_lock(&watch_events_lock); - ent = watch_events.next; - if (ent != &watch_events) - list_del(ent); + event = list_first_entry_or_null(&watch_events, + struct xs_watch_event, list); + if (event) { + list_del(&event->list); + event->handle->nr_pending--; + } spin_unlock(&watch_events_lock); - if (ent != &watch_events) { - msg = list_entry(ent, struct xs_stored_msg, list); - msg->u.watch.handle->callback( - msg->u.watch.handle, - (const char **)msg->u.watch.vec, - msg->u.watch.vec_size); - kfree(msg->u.watch.vec); - kfree(msg); + if (event) { + event->handle->callback(event->handle, event->path, + event->token); + kfree(event); } mutex_unlock(&xenwatch_mutex); @@ -801,126 +876,37 @@ static int xenwatch_thread(void *unused) return 0; } -static int process_msg(void) +/* + * Wake up all threads waiting for a xenstore reply. In case of shutdown all + * pending replies will be marked as "aborted" in order to let the waiters + * return in spite of xenstore possibly no longer being able to reply. 
This + * will avoid blocking shutdown by a thread waiting for xenstore but being + * necessary for shutdown processing to proceed. + */ +static int xs_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) { - struct xs_stored_msg *msg; - char *body; - int err; - - /* - * We must disallow save/restore while reading a xenstore message. - * A partial read across s/r leaves us out of sync with xenstored. - */ - for (;;) { - err = xb_wait_for_data_to_read(); - if (err) - return err; - mutex_lock(&xs_state.response_mutex); - if (xb_data_to_read()) - break; - /* We raced with save/restore: pending data 'disappeared'. */ - mutex_unlock(&xs_state.response_mutex); - } - + struct xb_req_data *req; - msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH); - if (msg == NULL) { - err = -ENOMEM; - goto out; - } - - err = xb_read(&msg->hdr, sizeof(msg->hdr)); - if (err) { - kfree(msg); - goto out; - } - - if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { - kfree(msg); - err = -EINVAL; - goto out; - } - - body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); - if (body == NULL) { - kfree(msg); - err = -ENOMEM; - goto out; - } - - err = xb_read(body, msg->hdr.len); - if (err) { - kfree(body); - kfree(msg); - goto out; - } - body[msg->hdr.len] = '\0'; - - if (msg->hdr.type == XS_WATCH_EVENT) { - msg->u.watch.vec = split(body, msg->hdr.len, - &msg->u.watch.vec_size); - if (IS_ERR(msg->u.watch.vec)) { - err = PTR_ERR(msg->u.watch.vec); - kfree(msg); - goto out; - } - - spin_lock(&watches_lock); - msg->u.watch.handle = find_watch( - msg->u.watch.vec[XS_WATCH_TOKEN]); - if (msg->u.watch.handle != NULL) { - spin_lock(&watch_events_lock); - list_add_tail(&msg->list, &watch_events); - wake_up(&watch_events_waitq); - spin_unlock(&watch_events_lock); - } else { - kfree(msg->u.watch.vec); - kfree(msg); - } - spin_unlock(&watches_lock); - } else { - msg->u.reply.body = body; - spin_lock(&xs_state.reply_lock); - list_add_tail(&msg->list, &xs_state.reply_list); - spin_unlock(&xs_state.reply_lock); - wake_up(&xs_state.reply_waitq); - } - - out: - mutex_unlock(&xs_state.response_mutex); - return err; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) + wake_up(&req->wq); + list_for_each_entry(req, &xb_write_list, list) + wake_up(&req->wq); + mutex_unlock(&xb_write_mutex); + return NOTIFY_DONE; } -static int xenbus_thread(void *unused) -{ - int err; - - for (;;) { - err = process_msg(); - if (err) - pr_warn("error %d while reading message\n", err); - if (kthread_should_stop()) - break; - } - - return 0; -} +static struct notifier_block xs_reboot_nb = { + .notifier_call = xs_reboot_notify, +}; int xs_init(void) { int err; struct task_struct *task; - INIT_LIST_HEAD(&xs_state.reply_list); - spin_lock_init(&xs_state.reply_lock); - init_waitqueue_head(&xs_state.reply_waitq); - - mutex_init(&xs_state.request_mutex); - mutex_init(&xs_state.response_mutex); - mutex_init(&xs_state.transaction_mutex); - init_rwsem(&xs_state.watch_mutex); - atomic_set(&xs_state.transaction_count, 0); - init_waitqueue_head(&xs_state.transaction_wq); + register_reboot_notifier(&xs_reboot_nb); /* Initialize the shared memory rings to talk to xenstored */ err = xb_init_comms(); @@ -930,11 +916,6 @@ int xs_init(void) task = kthread_run(xenwatch_thread, NULL, "xenwatch"); if (IS_ERR(task)) return PTR_ERR(task); - xenwatch_pid = task->pid; - - task = kthread_run(xenbus_thread, NULL, "xenbus"); - if (IS_ERR(task)) - return PTR_ERR(task); /* shutdown watches for kexec boot */ xs_reset_watches(); diff --git 
a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index 4793fc594549..000000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. 
- */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index b019865fcc56..8490644df1a3 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,4 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +xenfs-$(CONFIG_XEN_SYMS) += xensyms.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 06092e0fe8ce..37ea7c5c0346 100644 --- 
a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * xenfs.c - a filesystem for passing info between the a domain and * the hypervisor. @@ -13,13 +14,14 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/magic.h> #include <xen/xen.h> +#include <xen/xenbus.h> #include "xenfs.h" #include "../privcmd.h" -#include "../xenbus/xenbus_comms.h" #include <asm/xen/hypervisor.h> @@ -42,21 +44,24 @@ static const struct file_operations capabilities_file_ops = { .llseek = default_llseek, }; -static int xenfs_fill_super(struct super_block *sb, void *data, int silent) +static int xenfs_fill_super(struct super_block *sb, struct fs_context *fc) { - static struct tree_descr xenfs_files[] = { + static const struct tree_descr xenfs_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - static struct tree_descr xenfs_init_files[] = { + static const struct tree_descr xenfs_init_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, +#ifdef CONFIG_XEN_SYMS + { "xensyms", &xensyms_ops, S_IRUSR}, +#endif {""}, }; @@ -64,18 +69,26 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) xen_initial_domain() ? xenfs_init_files : xenfs_files); } -static struct dentry *xenfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int xenfs_get_tree(struct fs_context *fc) { - return mount_single(fs_type, flags, data, xenfs_fill_super); + return get_tree_single(fc, xenfs_fill_super); +} + +static const struct fs_context_operations xenfs_context_ops = { + .get_tree = xenfs_get_tree, +}; + +static int xenfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &xenfs_context_ops; + return 0; } static struct file_system_type xenfs_type = { .owner = THIS_MODULE, .name = "xenfs", - .mount = xenfs_mount, - .kill_sb = kill_litter_super, + .init_fs_context = xenfs_init_fs_context, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("xenfs"); @@ -84,7 +97,6 @@ static int __init xenfs_init(void) if (xen_domain()) return register_filesystem(&xenfs_type); - pr_info("not registering filesystem on non-xen platform\n"); return 0; } diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 6b80c7779c02..cfe4874b83a7 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -1,7 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XENFS_XENBUS_H #define _XENFS_XENBUS_H extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_port_file_ops; +extern const struct file_operations xensyms_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c index fef20dbc6a5c..f59235f9f8a2 100644 --- a/drivers/xen/xenfs/xenstored.c +++ b/drivers/xen/xenfs/xenstored.c @@ -1,12 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fs.h> #include <xen/page.h> +#include <xen/xenbus.h> #include "xenfs.h" -#include "../xenbus/xenbus_comms.h" static ssize_t xsd_read(struct file *file, char __user *buf, size_t size, loff_t 
*off) diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c new file mode 100644 index 000000000000..088b7f02c358 --- /dev/null +++ b/drivers/xen/xenfs/xensyms.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <xen/xen-ops.h> +#include "xenfs.h" + + +#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */ + +struct xensyms { + struct xen_platform_op op; + char *name; + uint32_t namelen; +}; + +/* Grab next output page from the hypervisor */ +static int xensyms_next_sym(struct xensyms *xs) +{ + int ret; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + uint64_t symnum; + + memset(xs->name, 0, xs->namelen); + symdata->namelen = xs->namelen; + + symnum = symdata->symnum; + + ret = HYPERVISOR_platform_op(&xs->op); + if (ret < 0) + return ret; + + /* + * If hypervisor's symbol didn't fit into the buffer then allocate + * a larger buffer and try again. + */ + if (unlikely(symdata->namelen > xs->namelen)) { + kfree(xs->name); + + xs->namelen = symdata->namelen; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) + return -ENOMEM; + + set_xen_guest_handle(symdata->name, xs->name); + symdata->symnum = symnum; /* Rewind */ + + ret = HYPERVISOR_platform_op(&xs->op); + if (ret < 0) + return ret; + } + + if (symdata->symnum == symnum) + /* End of symbols */ + return 1; + + return 0; +} + +static void *xensyms_start(struct seq_file *m, loff_t *pos) +{ + struct xensyms *xs = m->private; + + xs->op.u.symdata.symnum = *pos; + + if (xensyms_next_sym(xs)) + return NULL; + + return m->private; +} + +static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct xensyms *xs = m->private; + + *pos = xs->op.u.symdata.symnum; + + if (xensyms_next_sym(xs)) + return NULL; + + return p; +} + +static int xensyms_show(struct seq_file *m, void *p) +{ + struct xensyms *xs = m->private; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + + seq_printf(m, "%016llx %c %s\n", symdata->address, + symdata->type, xs->name); + + return 0; +} + +static void xensyms_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations xensyms_seq_ops = { + .start = xensyms_start, + .next = xensyms_next, + .show = xensyms_show, + .stop = xensyms_stop, +}; + +static int xensyms_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct xensyms *xs; + int ret; + + ret = seq_open_private(file, &xensyms_seq_ops, + sizeof(struct xensyms)); + if (ret) + return ret; + + m = file->private_data; + xs = m->private; + + xs->namelen = XEN_KSYM_NAME_LEN + 1; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) { + seq_release_private(inode, file); + return -ENOMEM; + } + set_xen_guest_handle(xs->op.u.symdata.name, xs->name); + xs->op.cmd = XENPF_get_symbol; + xs->op.u.symdata.namelen = xs->namelen; + + return 0; +} + +static int xensyms_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct xensyms *xs = m->private; + + kfree(xs->name); + return seq_release_private(inode, file); +} + +const struct file_operations xensyms_ops = { + .open = xensyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = xensyms_release +}; diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c new file mode 100644 index 000000000000..f17c4c03db30 --- 
/dev/null +++ b/drivers/xen/xlate_mmu.c @@ -0,0 +1,295 @@ +/* + * MMU operations common to all auto-translated physmap guests. + * + * Copyright (C) 2015 Citrix Systems R&D Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/memory.h> +#include <xen/balloon.h> + +typedef void (*xen_gfn_fn_t)(unsigned long gfn, void *data); + +/* Break down the pages in 4KB chunk and call fn for each gfn */ +static void xen_for_each_gfn(struct page **pages, unsigned nr_gfn, + xen_gfn_fn_t fn, void *data) +{ + unsigned long xen_pfn = 0; + struct page *page; + int i; + + for (i = 0; i < nr_gfn; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + page = pages[i / XEN_PFN_PER_PAGE]; + xen_pfn = page_to_xen_pfn(page); + } + fn(pfn_to_gfn(xen_pfn++), data); + } +} + +struct remap_data { + xen_pfn_t *fgfn; /* foreign domain's gfn */ + int nr_fgfn; /* Number of foreign gfn left to map */ + pgprot_t prot; + domid_t domid; + struct vm_area_struct *vma; + int index; + struct page **pages; + struct xen_remap_gfn_info *info; + int *err_ptr; + int mapped; + + /* Hypercall parameters */ + int h_errs[XEN_PFN_PER_PAGE]; + xen_ulong_t h_idxs[XEN_PFN_PER_PAGE]; + xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE]; + + int h_iter; /* Iterator */ +}; + +static void setup_hparams(unsigned long gfn, void *data) +{ + struct remap_data *info = data; + + info->h_idxs[info->h_iter] = *info->fgfn; + info->h_gpfns[info->h_iter] = gfn; + info->h_errs[info->h_iter] = 0; + + info->h_iter++; + info->fgfn++; +} + +static int remap_pte_fn(pte_t *ptep, unsigned long addr, void *data) +{ + struct remap_data *info = data; + struct page *page = info->pages[info->index++]; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot)); + int rc, nr_gfn; + uint32_t i; + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = info->domid, + .space = XENMAPSPACE_gmfn_foreign, + }; + + nr_gfn = 
min_t(typeof(info->nr_fgfn), XEN_PFN_PER_PAGE, info->nr_fgfn); + info->nr_fgfn -= nr_gfn; + + info->h_iter = 0; + xen_for_each_gfn(&page, nr_gfn, setup_hparams, info); + BUG_ON(info->h_iter != nr_gfn); + + set_xen_guest_handle(xatp.idxs, info->h_idxs); + set_xen_guest_handle(xatp.gpfns, info->h_gpfns); + set_xen_guest_handle(xatp.errs, info->h_errs); + xatp.size = nr_gfn; + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + + /* info->err_ptr expect to have one error status per Xen PFN */ + for (i = 0; i < nr_gfn; i++) { + int err = (rc < 0) ? rc : info->h_errs[i]; + + *(info->err_ptr++) = err; + if (!err) + info->mapped++; + } + + /* + * Note: The hypercall will return 0 in most of the case if even if + * all the fgmfn are not mapped. We still have to update the pte + * as the userspace may decide to continue. + */ + if (!rc) + set_pte_at(info->vma->vm_mm, addr, ptep, pte); + + return 0; +} + +int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t *gfn, int nr, + int *err_ptr, pgprot_t prot, + unsigned domid, + struct page **pages) +{ + int err; + struct remap_data data; + unsigned long range = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE) << PAGE_SHIFT; + + /* Kept here for the purpose of making sure code doesn't break + x86 PVOPS */ + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + + data.fgfn = gfn; + data.nr_fgfn = nr; + data.prot = prot; + data.domid = domid; + data.vma = vma; + data.pages = pages; + data.index = 0; + data.err_ptr = err_ptr; + data.mapped = 0; + + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_pte_fn, &data); + return err < 0 ? err : data.mapped; +} +EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array); + +static void unmap_gfn(unsigned long gfn, void *data) +{ + struct xen_remove_from_physmap xrp; + + xrp.domid = DOMID_SELF; + xrp.gpfn = gfn; + (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); +} + +int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, + int nr, struct page **pages) +{ + xen_for_each_gfn(pages, nr, unmap_gfn, NULL); + + return 0; +} +EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range); + +struct map_balloon_pages { + xen_pfn_t *pfns; + unsigned int idx; +}; + +static void setup_balloon_gfn(unsigned long gfn, void *data) +{ + struct map_balloon_pages *info = data; + + info->pfns[info->idx++] = gfn; +} + +/** + * xen_xlate_map_ballooned_pages - map a new set of ballooned pages + * @gfns: returns the array of corresponding GFNs + * @virt: returns the virtual address of the mapped region + * @nr_grant_frames: number of GFNs + * @return 0 on success, error otherwise + * + * This allocates a set of ballooned pages and maps them into the + * kernel's address space. 
+ */ +int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt, + unsigned long nr_grant_frames) +{ + struct page **pages; + xen_pfn_t *pfns; + void *vaddr; + struct map_balloon_pages data; + int rc; + unsigned long nr_pages; + + BUG_ON(nr_grant_frames == 0); + nr_pages = DIV_ROUND_UP(nr_grant_frames, XEN_PFN_PER_PAGE); + pages = kcalloc(nr_pages, sizeof(pages[0]), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); + if (!pfns) { + kfree(pages); + return -ENOMEM; + } + rc = xen_alloc_unpopulated_pages(nr_pages, pages); + if (rc) { + pr_warn("%s Couldn't balloon alloc %ld pages rc:%d\n", __func__, + nr_pages, rc); + kfree(pages); + kfree(pfns); + return rc; + } + + data.pfns = pfns; + data.idx = 0; + xen_for_each_gfn(pages, nr_grant_frames, setup_balloon_gfn, &data); + + vaddr = vmap(pages, nr_pages, 0, PAGE_KERNEL); + if (!vaddr) { + pr_warn("%s Couldn't map %ld pages rc:%d\n", __func__, + nr_pages, rc); + xen_free_unpopulated_pages(nr_pages, pages); + kfree(pages); + kfree(pfns); + return -ENOMEM; + } + kfree(pages); + + *gfns = pfns; + *virt = vaddr; + + return 0; +} + +struct remap_pfn { + struct mm_struct *mm; + struct page **pages; + pgprot_t prot; + unsigned long i; +}; + +static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) +{ + struct remap_pfn *r = data; + struct page *page = r->pages[r->i]; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot)); + + set_pte_at(r->mm, addr, ptep, pte); + r->i++; + + return 0; +} + +/* Used by the privcmd module, but has to be built-in on ARM */ +int xen_remap_vma_range(struct vm_area_struct *vma, unsigned long addr, unsigned long len) +{ + struct remap_pfn r = { + .mm = vma->vm_mm, + .pages = vma->vm_private_data, + .prot = vma->vm_page_prot, + }; + + return apply_to_page_range(vma->vm_mm, addr, len, remap_pfn_fn, &r); +} +EXPORT_SYMBOL_GPL(xen_remap_vma_range); |
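
Editor's note: the xenstore request rework above changes the public transaction helpers (xenbus_transaction_end() now takes a bool abort, and xenbus_read_unsigned() is new). The following is a minimal usage sketch only, not code from this series; the node path "device/demo/0", the value names, and the demo_* functions are illustrative assumptions, while the function signatures follow the diff above.

/*
 * Minimal sketch of driving the xenbus transaction interface touched
 * in this series. The "device/demo/0" path and demo_* names are
 * hypothetical; only the xenbus_* signatures match the code above.
 */
#include <xen/xenbus.h>

static int demo_publish_ring(unsigned int ring_ref, unsigned int evtchn)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, "device/demo/0", "ring-ref", "%u", ring_ref);
	if (err)
		goto abort;

	err = xenbus_printf(xbt, "device/demo/0", "event-channel", "%u", evtchn);
	if (err)
		goto abort;

	err = xenbus_transaction_end(xbt, false);
	if (err == -EAGAIN)
		goto again;	/* another writer raced with us, retry */
	return err;

abort:
	xenbus_transaction_end(xbt, true);
	return err;
}

static unsigned int demo_read_feature(void)
{
	/* Returns the default (0) if the node is absent or unreadable. */
	return xenbus_read_unsigned("device/demo/0", "feature-demo", 0);
}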
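
A similar sketch for the reworked watch handling: after this series the callback receives the event path and token directly (instead of a string vector), and an optional will_handle hook can reject events before they are queued. The watched node and demo_* names below are again illustrative assumptions, not part of the patch.

/*
 * Hypothetical watch registration against the watch code above.
 * callback() signature and register/unregister_xenbus_watch() follow
 * the diff; the watched node is made up for illustration.
 */
#include <linux/kernel.h>
#include <xen/xenbus.h>

static void demo_watch_cb(struct xenbus_watch *watch,
			  const char *path, const char *token)
{
	pr_info("xenstore node %s changed\n", path);
}

static struct xenbus_watch demo_watch = {
	.node = "device/demo/0/state",
	.callback = demo_watch_cb,
	/* .will_handle left NULL: queue every event for this node */
};

static int demo_setup_watch(void)
{
	return register_xenbus_watch(&demo_watch);
}

static void demo_teardown_watch(void)
{
	unregister_xenbus_watch(&demo_watch);
}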
