Diffstat (limited to 'drivers/xen')
76 files changed, 6452 insertions, 5613 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 838b66a9a0e7..f9a35ed266ec 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 menu "Xen driver support"
 	depends on XEN
 
@@ -9,30 +10,19 @@ config XEN_BALLOON
 	  the system to expand the domain's memory allocation, or alternatively
 	  return unneeded memory to the system.
 
-config XEN_SELFBALLOONING
-	bool "Dynamically self-balloon kernel memory to target"
-	depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
-	help
-	  Self-ballooning dynamically balloons available kernel memory driven
-	  by the current usage of anonymous memory ("committed AS") and
-	  controlled by various sysfs-settable parameters. Configuring
-	  FRONTSWAP is highly recommended; if it is not configured, self-
-	  ballooning is disabled by default. If FRONTSWAP is configured,
-	  frontswap-selfshrinking is enabled by default but can be disabled
-	  with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning
-	  is enabled by default but can be disabled with the 'tmem.selfballooning=0'
-	  kernel boot parameter. Note that systems without a sufficiently
-	  large swap device should not enable self-ballooning.
-
 config XEN_BALLOON_MEMORY_HOTPLUG
 	bool "Memory hotplug support for Xen balloon driver"
 	depends on XEN_BALLOON && MEMORY_HOTPLUG
+	default y
 	help
 	  Memory hotplug support for Xen balloon driver allows expanding
 	  memory available for the system above limit declared at system
 	  startup.
 	  It is very useful on critical systems which require long
 	  run without rebooting.
 
+	  It's also very useful for non PV domains to obtain unpopulated physical
+	  memory ranges to use in order to map foreign memory or grants.
+
 	  Memory could be hotplugged in following steps:
 
 	    1) target domain: ensure that memory auto online policy is in
@@ -60,15 +50,13 @@ config XEN_BALLOON_MEMORY_HOTPLUG
 	  SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
 
-config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+config XEN_MEMORY_HOTPLUG_LIMIT
 	int "Hotplugged memory limit (in GiB) for a PV guest"
-	default 512 if X86_64
-	default 4 if X86_32
-	range 0 64 if X86_32
+	default 512
 	depends on XEN_HAVE_PVMMU
-	depends on XEN_BALLOON_MEMORY_HOTPLUG
+	depends on MEMORY_HOTPLUG
 	help
-	  Maxmium amount of memory (in GiB) that a PV guest can be
+	  Maximum amount of memory (in GiB) that a PV guest can be
 	  expanded to when using memory hotplug.
 
 	  A PV guest can have more memory than this limit if is
@@ -120,27 +108,27 @@ config XENFS
 	  If in doubt, say yes.
 
 config XEN_COMPAT_XENFS
-       bool "Create compatibility mount point /proc/xen"
-       depends on XENFS
-       default y
-       help
-         The old xenstore userspace tools expect to find "xenbus"
-         under /proc/xen, but "xenbus" is now found at the root of the
-         xenfs filesystem. Selecting this causes the kernel to create
-         the compatibility mount point /proc/xen if it is running on
-         a xen platform.
-         If in doubt, say yes.
+	bool "Create compatibility mount point /proc/xen"
+	depends on XENFS
+	default y
+	help
+	  The old xenstore userspace tools expect to find "xenbus"
+	  under /proc/xen, but "xenbus" is now found at the root of the
+	  xenfs filesystem. Selecting this causes the kernel to create
+	  the compatibility mount point /proc/xen if it is running on
+	  a xen platform.
+	  If in doubt, say yes.
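The onlining step in the XEN_BALLOON_MEMORY_HOTPLUG help text above, normally handled by the quoted udev rule, can also be driven by hand through the same sysfs files. A minimal user-space sketch, illustration only and not part of this diff, assuming the standard memory-hotplug layout under /sys/devices/system/memory:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	DIR *dir = opendir("/sys/devices/system/memory");
	struct dirent *d;

	if (!dir)
		return 1;

	while ((d = readdir(dir)) != NULL) {
		char path[PATH_MAX];
		FILE *f;

		/* Hotpluggable blocks show up as memoryNN directories. */
		if (strncmp(d->d_name, "memory", 6) != 0)
			continue;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/memory/%s/state", d->d_name);
		f = fopen(path, "w");
		if (!f)
			continue;
		fputs("online", f);	/* no effect on blocks already online */
		fclose(f);
	}
	closedir(dir);
	return 0;
}

Re-running the loop is harmless; blocks that are already online are simply left as they are.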
 config XEN_SYS_HYPERVISOR
-       bool "Create xen entries under /sys/hypervisor"
-       depends on SYSFS
-       select SYS_HYPERVISOR
-       default y
-       help
-         Create entries under /sys/hypervisor describing the Xen
-         hypervisor environment. When running native or in another
-         virtual environment, /sys/hypervisor will still be present,
-         but will have no xen contents.
+	bool "Create xen entries under /sys/hypervisor"
+	depends on SYSFS
+	select SYS_HYPERVISOR
+	default y
+	help
+	  Create entries under /sys/hypervisor describing the Xen
+	  hypervisor environment. When running native or in another
+	  virtual environment, /sys/hypervisor will still be present,
+	  but will have no xen contents.
 
 config XEN_XENBUS_FRONTEND
 	tristate
 
@@ -150,12 +138,14 @@ config XEN_GNTDEV
 	depends on XEN
 	default m
 	select MMU_NOTIFIER
+	select FIND_NORMAL_PAGE
 	help
 	  Allows userspace processes to use grants.
 
 config XEN_GNTDEV_DMABUF
 	bool "Add support for dma-buf grant access device driver extension"
-	depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC && DMA_SHARED_BUFFER
+	depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC
+	select DMA_SHARED_BUFFER
 	help
 	  Allows userspace processes and kernel modules to use Xen backed
 	  dma-buf implementation. With this extension grant references to
@@ -188,20 +178,38 @@ config XEN_GRANT_DMA_ALLOC
 
 config SWIOTLB_XEN
 	def_bool y
+	depends on ARCH_HAS_DMA_OPS
+	depends on XEN_PV || ARM || ARM64
 	select SWIOTLB
 
-config XEN_TMEM
-	tristate
-	depends on !ARM && !ARM64
-	default m if (CLEANCACHE || FRONTSWAP)
+config XEN_PCI_STUB
+	bool
+
+config XEN_PCIDEV_STUB
+	tristate "Xen PCI-device stub driver"
+	depends on PCI && !X86 && XEN
+	depends on XEN_BACKEND
+	select XEN_PCI_STUB
+	default m
 	help
-	  Shim to interface in-kernel Transcendent Memory hooks
-	  (e.g. cleancache and frontswap) to Xen tmem hypercalls.
+	  The PCI device stub driver provides limited version of the PCI
+	  device backend driver without para-virtualized support for guests.
+	  If you select this to be a module, you will need to make sure no
+	  other driver has bound to the device(s) you want to make visible to
+	  other guests.
+
+	  The "hide" parameter (only applicable if backend driver is compiled
+	  into the kernel) allows you to bind the PCI devices to this module
+	  from the default device drivers. The argument is the list of PCI BDFs:
+	  xen-pciback.hide=(03:00.0)(04:00.0)
+
+	  If in doubt, say m.
 
 config XEN_PCIDEV_BACKEND
 	tristate "Xen PCI-device backend driver"
 	depends on PCI && X86 && XEN
 	depends on XEN_BACKEND
+	select XEN_PCI_STUB
 	default m
 	help
 	  The PCI device backend driver allows the kernel to export arbitrary
@@ -232,7 +240,7 @@ config XEN_PVCALLS_FRONTEND
 	  implements them.
 
 config XEN_PVCALLS_BACKEND
-	bool "XEN PV Calls backend driver"
+	tristate "XEN PV Calls backend driver"
 	depends on INET && XEN && XEN_BACKEND
 	help
 	  Experimental backend for the Xen PV Calls protocol
@@ -252,47 +260,29 @@ config XEN_SCSI_BACKEND
 	  if guests need generic access to SCSI devices.
 
 config XEN_PRIVCMD
-	tristate
+	tristate "Xen hypercall passthrough driver"
 	depends on XEN
 	default m
-
-config XEN_STUB
-	bool "Xen stub drivers"
-	depends on XEN && X86_64 && BROKEN
 	help
-	  Allow kernel to install stub drivers, to reserve space for Xen drivers,
-	  i.e. memory hotplug and cpu hotplug, and to block native drivers loaded,
-	  so that real Xen drivers can be modular.
-
-	  To enable Xen features like cpu and memory hotplug, select Y here.
-
-config XEN_ACPI_HOTPLUG_MEMORY
-	tristate "Xen ACPI memory hotplug"
-	depends on XEN_DOM0 && XEN_STUB && ACPI
-	help
-	  This is Xen ACPI memory hotplug.
-
-	  Currently Xen only support ACPI memory hot-add. If you want
-	  to hot-add memory at runtime (the hot-added memory cannot be
-	  removed until machine stop), select Y/M here, otherwise select N.
-
-config XEN_ACPI_HOTPLUG_CPU
-	tristate "Xen ACPI cpu hotplug"
-	depends on XEN_DOM0 && XEN_STUB && ACPI
-	select ACPI_CONTAINER
+	  The hypercall passthrough driver allows privileged user programs to
+	  perform Xen hypercalls. This driver is normally required for systems
+	  running as Dom0 to perform privileged operations, but in some
+	  disaggregated Xen setups this driver might be needed for other
+	  domains, too.
+
+config XEN_PRIVCMD_EVENTFD
+	bool "Xen Ioeventfd and irqfd support"
+	depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
 	help
-	  Xen ACPI cpu enumerating and hotplugging
-
-	  For hotplugging, currently Xen only support ACPI cpu hotadd.
-	  If you want to hotadd cpu at runtime (the hotadded cpu cannot
-	  be removed until machine stop), select Y/M here.
+	  Using the ioeventfd / irqfd mechanism a virtio backend running in a
+	  daemon can speed up interrupt delivery from / to a guest.
 
 config XEN_ACPI_PROCESSOR
 	tristate "Xen ACPI processor"
 	depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ
 	default m
 	help
-	  This ACPI processor uploads Power Management information to the Xen
+	  This ACPI processor uploads Power Management information to the Xen
 	  hypervisor.
 
 	  To do that the driver parses the Power Management data and uploads
@@ -301,19 +291,19 @@ config XEN_ACPI_PROCESSOR
 	  SMM so that other drivers (such as ACPI cpufreq scaling driver) will
 	  not load.
 
-	  To compile this driver as a module, choose M here: the module will be
+	  To compile this driver as a module, choose M here: the module will be
 	  called xen_acpi_processor  If you do not know what to choose, select
 	  M here. If the CPUFREQ drivers are built in, select Y here.
 
 config XEN_MCE_LOG
 	bool "Xen platform mcelog"
-	depends on XEN_DOM0 && X86_64 && X86_MCE
+	depends on XEN_PV_DOM0 && X86_MCE
 	help
 	  Allow kernel fetching MCE error from Xen platform and converting
 	  it into Linux mcelog format for mcelog tools
 
 config XEN_HAVE_PVMMU
-       bool
+	bool
 
 config XEN_EFI
 	def_bool y
@@ -330,17 +320,56 @@ config XEN_ACPI
 	depends on X86 && ACPI
 
 config XEN_SYMS
-       bool "Xen symbols"
-       depends on X86 && XEN_DOM0 && XENFS
-       default y if KALLSYMS
-       help
-         Exports hypervisor symbols (along with their types and addresses) via
-         /proc/xen/xensyms file, similar to /proc/kallsyms
+	bool "Xen symbols"
+	depends on X86 && XEN_DOM0 && XENFS
+	default y if KALLSYMS
+	help
+	  Exports hypervisor symbols (along with their types and addresses) via
+	  /proc/xen/xensyms file, similar to /proc/kallsyms
 
 config XEN_HAVE_VPMU
-       bool
+	bool
 
 config XEN_FRONT_PGDIR_SHBUF
 	tristate
 
+config XEN_UNPOPULATED_ALLOC
+	bool "Use unpopulated memory ranges for guest mappings"
+	depends on ZONE_DEVICE
+	default XEN_BACKEND || XEN_GNTDEV || XEN_DOM0
+	help
+	  Use unpopulated memory ranges in order to create mappings for guest
+	  memory regions, including grant maps and foreign pages. This avoids
+	  having to balloon out RAM regions in order to obtain physical memory
+	  space to create such mappings.
+
+config XEN_GRANT_DMA_IOMMU
+	bool
+	select IOMMU_API
+
+config XEN_GRANT_DMA_OPS
+	bool
+
+config XEN_VIRTIO
+	bool "Xen virtio support"
+	depends on ARCH_HAS_DMA_OPS
+	depends on VIRTIO
+	select XEN_GRANT_DMA_OPS
+	select XEN_GRANT_DMA_IOMMU if OF
+	help
+	  Enable virtio support for running as Xen guest. Depending on the
+	  guest type this will require special support on the backend side
+	  (qemu or kernel, depending on the virtio device types used).
+
+	  If in doubt, say n.
+
+config XEN_VIRTIO_FORCE_GRANT
+	bool "Require Xen virtio support to use grants"
+	depends on XEN_VIRTIO
+	help
+	  Require virtio for Xen guests to use grant mappings.
+	  This will avoid the need to give the backend the right to map all
+	  of the guest memory. This will need support on the backend side
+	  (e.g. qemu or kernel, depending on the virtio device types used).
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index c48927a58e10..c0503f1c7d5b 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
-obj-$(CONFIG_X86)	+= fallback.o
-obj-y	+= grant-table.o features.o balloon.o manage.o preempt.o time.o
+obj-y	+= grant-table.o features.o balloon.o manage.o time.o
 obj-y	+= mem-reservation.o
 obj-y	+= events/
 obj-y	+= xenbus/
 
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_features.o := $(nostackp)
+CFLAGS_features.o := -fno-stack-protector
 
 dom0-$(CONFIG_ARM64)	+= arm-device.o
 dom0-$(CONFIG_PCI)	+= pci.o
@@ -18,21 +16,16 @@ dom0-$(CONFIG_X86)	+= pcpu.o
 obj-$(CONFIG_XEN_DOM0)	+= $(dom0-y)
 obj-$(CONFIG_BLOCK)	+= biomerge.o
 obj-$(CONFIG_XEN_BALLOON)	+= xen-balloon.o
-obj-$(CONFIG_XEN_SELFBALLOONING)	+= xen-selfballoon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)	+= xen-evtchn.o
 obj-$(CONFIG_XEN_GNTDEV)	+= xen-gntdev.o
 obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)	+= xen-gntalloc.o
 obj-$(CONFIG_XENFS)	+= xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
-obj-$(CONFIG_XEN_PVHVM)	+= platform-pci.o
-obj-$(CONFIG_XEN_TMEM)	+= tmem.o
+obj-$(CONFIG_XEN_PVHVM_GUEST)	+= platform-pci.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= swiotlb-xen.o
 obj-$(CONFIG_XEN_MCE_LOG)	+= mcelog.o
-obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
+obj-$(CONFIG_XEN_PCI_STUB)	+= xen-pciback/
 obj-$(CONFIG_XEN_PRIVCMD)	+= xen-privcmd.o
-obj-$(CONFIG_XEN_STUB)	+= xen-stub.o
-obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY)	+= xen-acpi-memhotplug.o
-obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)	+= xen-acpi-cpuhotplug.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
 obj-$(CONFIG_XEN_EFI)	+= efi.o
 obj-$(CONFIG_XEN_SCSI_BACKEND)	+= xen-scsiback.o
@@ -45,3 +38,6 @@ xen-gntdev-$(CONFIG_XEN_GNTDEV_DMABUF)	+= gntdev-dmabuf.o
 xen-gntalloc-y	:= gntalloc.o
 xen-privcmd-y	:= privcmd.o privcmd-buf.o
 obj-$(CONFIG_XEN_FRONT_PGDIR_SHBUF)	+= xen-front-pgdir-shbuf.o
+obj-$(CONFIG_XEN_UNPOPULATED_ALLOC)	+= unpopulated-alloc.o
+obj-$(CONFIG_XEN_GRANT_DMA_OPS)	+= grant-dma-ops.o
+obj-$(CONFIG_XEN_GRANT_DMA_IOMMU)	+= grant-dma-iommu.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
index 6893c79fd2a1..d2ee605c5ca1 100644
--- a/drivers/xen/acpi.c
+++ b/drivers/xen/acpi.c
@@ -30,6 +30,7 @@
  * IN THE SOFTWARE.
  */
 
+#include <linux/pci.h>
 #include <xen/acpi.h>
 #include <xen/interface/platform.h>
 #include <asm/xen/hypercall.h>
@@ -75,3 +76,76 @@ int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state,
 	return xen_acpi_notify_hypervisor_state(sleep_state, val_a,
 						val_b, true);
 }
+
+struct acpi_prt_entry {
+	struct acpi_pci_id id;
+	u8 pin;
+	acpi_handle link;
+	u32 index;
+};
+
+int xen_acpi_get_gsi_info(struct pci_dev *dev,
+			  int *gsi_out,
+			  int *trigger_out,
+			  int *polarity_out)
+{
+	int gsi;
+	u8 pin;
+	struct acpi_prt_entry *entry;
+	int trigger = ACPI_LEVEL_SENSITIVE;
+	int polarity = acpi_irq_model == ACPI_IRQ_MODEL_GIC ?
+ ACPI_ACTIVE_HIGH : ACPI_ACTIVE_LOW; + + if (!dev || !gsi_out || !trigger_out || !polarity_out) + return -EINVAL; + + pin = dev->pin; + if (!pin) + return -EINVAL; + + entry = acpi_pci_irq_lookup(dev, pin); + if (entry) { + if (entry->link) + gsi = acpi_pci_link_allocate_irq(entry->link, + entry->index, + &trigger, &polarity, + NULL); + else + gsi = entry->index; + } else + gsi = -1; + + if (gsi < 0) + return -EINVAL; + + *gsi_out = gsi; + *trigger_out = trigger; + *polarity_out = polarity; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_acpi_get_gsi_info); + +static get_gsi_from_sbdf_t get_gsi_from_sbdf; +static DEFINE_RWLOCK(get_gsi_from_sbdf_lock); + +void xen_acpi_register_get_gsi_func(get_gsi_from_sbdf_t func) +{ + write_lock(&get_gsi_from_sbdf_lock); + get_gsi_from_sbdf = func; + write_unlock(&get_gsi_from_sbdf_lock); +} +EXPORT_SYMBOL_GPL(xen_acpi_register_get_gsi_func); + +int xen_acpi_get_gsi_from_sbdf(u32 sbdf) +{ + int ret = -EOPNOTSUPP; + + read_lock(&get_gsi_from_sbdf_lock); + if (get_gsi_from_sbdf) + ret = get_gsi_from_sbdf(sbdf); + read_unlock(&get_gsi_from_sbdf_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_acpi_get_gsi_from_sbdf); diff --git a/drivers/xen/arm-device.c b/drivers/xen/arm-device.c index 3e789c77f568..87493f92291f 100644 --- a/drivers/xen/arm-device.c +++ b/drivers/xen/arm-device.c @@ -1,17 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015, Linaro Limited, Shannon Zhao - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/platform_device.h> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ceb5048de9a7..49c3f9926394 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -43,6 +43,8 @@ #include <linux/sched.h> #include <linux/cred.h> #include <linux/errno.h> +#include <linux/freezer.h> +#include <linux/kthread.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/pagemap.h> @@ -56,10 +58,10 @@ #include <linux/percpu-defs.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/moduleparam.h> +#include <linux/jiffies.h> #include <asm/page.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/xen/hypervisor.h> @@ -73,44 +75,29 @@ #include <xen/page.h> #include <xen/mem-reservation.h> -static int xen_hotplug_unpopulated; +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." 
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static uint __read_mostly balloon_boot_timeout = 180; +module_param(balloon_boot_timeout, uint, 0444); -static int zero; -static int one = 1; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static int xen_hotplug_unpopulated; -static struct ctl_table balloon_table[] = { +static const struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", .data = &xen_hotplug_unpopulated, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, - { } -}; - -static struct ctl_table balloon_root[] = { - { - .procname = "balloon", - .mode = 0555, - .child = balloon_table, - }, - { } -}; - -static struct ctl_table xen_root[] = { - { - .procname = "xen", - .mode = 0555, - .child = balloon_root, - }, - { } }; +#else +#define xen_hotplug_unpopulated 0 #endif /* @@ -120,7 +107,7 @@ static struct ctl_table xen_root[] = { #define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) /* - * balloon_process() state: + * balloon_thread() state: * * BP_DONE: done or nothing to do, * BP_WAIT: wait to be rescheduled, @@ -128,13 +115,15 @@ static struct ctl_table xen_root[] = { * BP_ECANCELED: error, balloon operation canceled. */ -enum bp_state { +static enum bp_state { BP_DONE, BP_WAIT, BP_EAGAIN, BP_ECANCELED -}; +} balloon_state = BP_DONE; +/* Main waiting point for xen-balloon thread. */ +static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq); static DEFINE_MUTEX(balloon_mutex); @@ -149,18 +138,17 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; static LIST_HEAD(ballooned_pages); static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); -/* Main work function, always executed in process context. */ -static void balloon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); - /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ #define GFP_BALLOON \ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) /* balloon_append: add the given page to the balloon. */ -static void __balloon_append(struct page *page) +static void balloon_append(struct page *page) { + if (!PageOffline(page)) + __SetPageOffline(page); + /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { list_add_tail(&page->lru, &ballooned_pages); @@ -169,12 +157,9 @@ static void __balloon_append(struct page *page) list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; } - wake_up(&balloon_wq); -} + inc_node_page_state(page, NR_BALLOON_PAGES); -static void balloon_append(struct page *page) -{ - __balloon_append(page); + wake_up(&balloon_wq); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ @@ -195,6 +180,9 @@ static struct page *balloon_retrieve(bool require_lowmem) else balloon_stats.balloon_low--; + __ClearPageOffline(page); + dec_node_page_state(page, NR_BALLOON_PAGES); + return page; } @@ -206,18 +194,15 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static enum bp_state update_schedule(enum bp_state state) +static void update_schedule(void) { - if (state == BP_WAIT) - return BP_WAIT; - - if (state == BP_ECANCELED) - return BP_ECANCELED; + if (balloon_state == BP_WAIT || balloon_state == BP_ECANCELED) + return; - if (state == BP_DONE) { + if (balloon_state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_DONE; + return; } ++balloon_stats.retry_count; @@ -226,7 +211,8 @@ static enum bp_state update_schedule(enum bp_state state) balloon_stats.retry_count > balloon_stats.max_retry_count) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_ECANCELED; + balloon_state = BP_ECANCELED; + return; } balloon_stats.schedule_delay <<= 1; @@ -234,7 +220,7 @@ static enum bp_state update_schedule(enum bp_state state) if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; - return BP_EAGAIN; + balloon_state = BP_EAGAIN; } #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG @@ -272,20 +258,6 @@ static struct resource *additional_memory_resource(phys_addr_t size) return NULL; } -#ifdef CONFIG_SPARSEMEM - { - unsigned long limit = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT); - unsigned long pfn = res->start >> PAGE_SHIFT; - - if (pfn > limit) { - pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n", - pfn, limit); - release_memory_resource(res); - return NULL; - } - } -#endif - return res; } @@ -330,7 +302,7 @@ static enum bp_state reserve_additional_memory(void) * are not restored since this region is now known not to * conflict with any devices. 
*/ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { unsigned long pfn, i; pfn = PFN_DOWN(resource->start); @@ -352,7 +324,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource); + rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); @@ -369,21 +341,25 @@ static enum bp_state reserve_additional_memory(void) return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static void xen_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(page); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + balloon_append(p); + } mutex_unlock(&balloon_mutex); } static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { if (val == MEM_ONLINE) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); return NOTIFY_OK; } @@ -395,7 +371,8 @@ static struct notifier_block xen_memory_nb = { #else static enum bp_state reserve_additional_memory(void) { - balloon_stats.target_pages = balloon_stats.current_pages; + balloon_stats.target_pages = balloon_stats.current_pages + + balloon_stats.target_unpopulated; return BP_ECANCELED; } #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ @@ -440,7 +417,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages) xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); - /* Relinquish the page back to the allocator. */ + /* + * Relinquish the page back to the allocator. Note that + * some pages, including ones added via xen_online_page(), might + * not be marked reserved; free_reserved_page() will handle that. + */ free_reserved_page(page); } @@ -507,43 +488,79 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * As this is a work item it is guaranteed to run as a single instance only. + * Stop waiting if either state is BP_DONE and ballooning action is + * needed, or if the credit has changed while state is not BP_DONE. + */ +static bool balloon_thread_cond(long credit) +{ + if (balloon_state == BP_DONE) + credit = 0; + + return current_credit() != credit || kthread_should_stop(); +} + +/* + * As this is a kthread it is guaranteed to run as a single instance only. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. 
*/ -static void balloon_process(struct work_struct *work) +static int balloon_thread(void *unused) { - enum bp_state state = BP_DONE; long credit; + unsigned long timeout; + + set_freezable(); + for (;;) { + switch (balloon_state) { + case BP_DONE: + case BP_ECANCELED: + timeout = 3600 * HZ; + break; + case BP_EAGAIN: + timeout = balloon_stats.schedule_delay * HZ; + break; + case BP_WAIT: + timeout = HZ; + break; + } + + credit = current_credit(); + + wait_event_freezable_timeout(balloon_thread_wq, + balloon_thread_cond(credit), timeout); + if (kthread_should_stop()) + return 0; - do { mutex_lock(&balloon_mutex); credit = current_credit(); if (credit > 0) { if (balloon_is_inflated()) - state = increase_reservation(credit); + balloon_state = increase_reservation(credit); else - state = reserve_additional_memory(); + balloon_state = reserve_additional_memory(); } - if (credit < 0) - state = decrease_reservation(-credit, GFP_BALLOON); + if (credit < 0) { + long n_pages; + + n_pages = min(-credit, si_mem_available()); + balloon_state = decrease_reservation(n_pages, + GFP_BALLOON); + if (balloon_state == BP_DONE && n_pages != -credit && + n_pages < totalreserve_pages) + balloon_state = BP_EAGAIN; + } - state = update_schedule(state); + update_schedule(); mutex_unlock(&balloon_mutex); cond_resched(); - - } while (credit && state == BP_DONE); - - /* Schedule more work if there is some still to be done. */ - if (state == BP_EAGAIN) - schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); + } } /* Resets the Xen limit, sets new target, and kicks off processing. */ @@ -551,25 +568,30 @@ void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); } EXPORT_SYMBOL_GPL(balloon_set_new_target); -static int add_ballooned_pages(int nr_pages) +static int add_ballooned_pages(unsigned int nr_pages) { enum bp_state st; if (xen_hotplug_unpopulated) { st = reserve_additional_memory(); if (st != BP_ECANCELED) { + int rc; + mutex_unlock(&balloon_mutex); - wait_event(balloon_wq, + rc = wait_event_interruptible(balloon_wq, !list_empty(&ballooned_pages)); mutex_lock(&balloon_mutex); - return 0; + return rc ? 
-ENOMEM : 0; } } + if (si_mem_available() < nr_pages) + return -ENOMEM; + st = decrease_reservation(nr_pages, GFP_USER); if (st != BP_DONE) return -ENOMEM; @@ -578,14 +600,14 @@ static int add_ballooned_pages(int nr_pages) } /** - * alloc_xenballooned_pages - get pages that have been ballooned out + * xen_alloc_ballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages) +int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages) { - int pgno = 0; + unsigned int pgno = 0; struct page *page; int ret; @@ -604,7 +626,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) */ BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (xen_pv_domain()) { ret = xen_alloc_p2m_entry(page_to_pfn(page)); if (ret < 0) goto out_undo; @@ -620,19 +642,25 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) return 0; out_undo: mutex_unlock(&balloon_mutex); - free_xenballooned_pages(pgno, pages); + xen_free_ballooned_pages(pgno, pages); + /* + * NB: xen_free_ballooned_pages will only subtract pgno pages, but since + * target_unpopulated is incremented with nr_pages at the start we need + * to remove the remaining ones also, or accounting will be screwed. + */ + balloon_stats.target_unpopulated -= nr_pages - pgno; return ret; } -EXPORT_SYMBOL(alloc_xenballooned_pages); +EXPORT_SYMBOL(xen_alloc_ballooned_pages); /** - * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * xen_free_ballooned_pages - return pages retrieved with get_ballooned_pages * @nr_pages: Number of pages * @pages: pages to return */ -void free_xenballooned_pages(int nr_pages, struct page **pages) +void xen_free_ballooned_pages(unsigned int nr_pages, struct page **pages) { - int i; + unsigned int i; mutex_lock(&balloon_mutex); @@ -645,52 +673,70 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) /* The balloon may be too large now. Shrink it if needed. */ if (current_credit()) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); mutex_unlock(&balloon_mutex); } -EXPORT_SYMBOL(free_xenballooned_pages); +EXPORT_SYMBOL(xen_free_ballooned_pages); -#ifdef CONFIG_XEN_PV -static void __init balloon_add_region(unsigned long start_pfn, - unsigned long pages) +static int __init balloon_add_regions(void) { + unsigned long start_pfn, pages; unsigned long pfn, extra_pfn_end; - struct page *page; + unsigned int i; - /* - * If the amount of usable memory has been limited (e.g., with - * the 'mem' command line parameter), don't add pages beyond - * this limit. - */ - extra_pfn_end = min(max_pfn, start_pfn + pages); - - for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { - page = pfn_to_page(pfn); - /* totalram_pages and totalhigh_pages do not - include the boot-time balloon extension, so - don't subtract from it. */ - __balloon_append(page); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + pages = xen_extra_mem[i].n_pfns; + if (!pages) + continue; + + start_pfn = xen_extra_mem[i].start_pfn; + + /* + * If the amount of usable memory has been limited (e.g., with + * the 'mem' command line parameter), don't add pages beyond + * this limit. 
+ */ + extra_pfn_end = min(max_pfn, start_pfn + pages); + + for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) + balloon_append(pfn_to_page(pfn)); + + /* + * Extra regions are accounted for in the physmap, but need + * decreasing from current_pages and target_pages to balloon + * down the initial allocation, because they are already + * accounted for in total_pages. + */ + pages = extra_pfn_end - start_pfn; + if (pages >= balloon_stats.current_pages || + pages >= balloon_stats.target_pages) { + WARN(1, "Extra pages underflow current target"); + return -ERANGE; + } + balloon_stats.current_pages -= pages; + balloon_stats.target_pages -= pages; } - balloon_stats.total_pages += extra_pfn_end - start_pfn; + return 0; } -#endif static int __init balloon_init(void) { + struct task_struct *task; + int rc; + if (!xen_domain()) return -ENODEV; pr_info("Initialising balloon driver\n"); -#ifdef CONFIG_XEN_PV - balloon_stats.current_pages = xen_pv_domain() - ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : get_num_physpages(); -#else - balloon_stats.current_pages = get_num_physpages(); -#endif + if (xen_released_pages >= get_num_physpages()) { + WARN(1, "Released pages underflow current target"); + return -ERANGE; + } + + balloon_stats.current_pages = get_num_physpages() - xen_released_pages; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -699,28 +745,23 @@ static int __init balloon_init(void) balloon_stats.schedule_delay = 1; balloon_stats.max_schedule_delay = 32; balloon_stats.retry_count = 1; - balloon_stats.max_retry_count = RETRY_UNLIMITED; + balloon_stats.max_retry_count = 4; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); - register_sysctl_table(xen_root); + register_sysctl_init("xen/balloon", balloon_table); #endif -#ifdef CONFIG_XEN_PV - { - int i; + rc = balloon_add_regions(); + if (rc) + return rc; - /* - * Initialize the balloon with pages from the extra memory - * regions (see arch/x86/xen/setup.c). - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].n_pfns) - balloon_add_region(xen_extra_mem[i].start_pfn, - xen_extra_mem[i].n_pfns); + task = kthread_run(balloon_thread, NULL, "xen-balloon"); + if (IS_ERR(task)) { + pr_err("xen-balloon thread could not be started, ballooning will not work!\n"); + return PTR_ERR(task); } -#endif /* Init the xen-balloon driver. */ xen_balloon_init(); @@ -728,3 +769,38 @@ static int __init balloon_init(void) return 0; } subsys_initcall(balloon_init); + +static int __init balloon_wait_finish(void) +{ + long credit, last_credit = 0; + unsigned long last_changed = 0; + + if (!xen_domain()) + return -ENODEV; + + /* PV guests don't need to wait. 
*/ + if (xen_pv_domain() || !current_credit()) + return 0; + + pr_notice("Waiting for initial ballooning down having finished.\n"); + + while ((credit = current_credit()) < 0) { + if (credit != last_credit) { + last_changed = jiffies; + last_credit = credit; + } + if (balloon_state == BP_ECANCELED) { + pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n", + -credit); + if (time_is_before_eq_jiffies(last_changed + HZ * balloon_boot_timeout)) + panic("Initial ballooning failed!\n"); + } + + schedule_timeout_interruptible(HZ / 10); + } + + pr_notice("Initial ballooning down finished.\n"); + + return 0; +} +late_initcall_sync(balloon_wait_finish); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index f3fbb700f569..05a286d24f14 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -4,12 +4,13 @@ #include <xen/xen.h> #include <xen/page.h> +/* check if @page can be merged with 'vec1' */ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2) + const struct page *page) { #if XEN_PAGE_SIZE == PAGE_SIZE unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); - unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); + unsigned long bfn2 = pfn_to_bfn(page_to_pfn(page)); return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; #else diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index b1357aa4bc55..b96b11e2b571 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -54,7 +54,7 @@ static int vcpu_online(unsigned int cpu) } static void vcpu_hotplug(unsigned int cpu) { - if (!cpu_possible(cpu)) + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) return; switch (vcpu_online(cpu)) { @@ -93,10 +93,8 @@ static int setup_cpu_watcher(struct notifier_block *notifier, (void)register_xenbus_watch(&cpu_watch); for_each_possible_cpu(cpu) { - if (vcpu_online(cpu) == 0) { - (void)cpu_down(cpu); - set_cpu_present(cpu, false); - } + if (vcpu_online(cpu) == 0) + disable_hotplug_cpu(cpu); } return NOTIFY_DONE; @@ -119,5 +117,5 @@ static int __init setup_vcpu_hotplug_event(void) return 0; } -arch_initcall(setup_vcpu_hotplug_event); +late_initcall(setup_vcpu_hotplug_event); diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index 8145a59fd9f6..cfb5de31d860 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/pci.h> #include <linux/usb.h> #include <linux/usb/ehci_def.h> diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 9243a9051078..fb321cd6415a 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * EFI support for Xen. 
* @@ -25,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/platform.h> +#include <xen/page.h> #include <xen/xen.h> #include <xen/xen-ops.h> @@ -39,7 +41,7 @@ #define efi_data(op) (op.u.efi_runtime_call) -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { struct xen_platform_op op = INIT_EFI_OP(get_time); @@ -60,9 +62,8 @@ efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_time); -efi_status_t xen_efi_set_time(efi_time_t *tm) +static efi_status_t xen_efi_set_time(efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_time); @@ -74,10 +75,10 @@ efi_status_t xen_efi_set_time(efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_time); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm) +static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); @@ -97,9 +98,8 @@ efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_wakeup_time); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time); @@ -116,11 +116,10 @@ efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_wakeup_time); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data) +static efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 *attr, unsigned long *data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(get_variable); @@ -140,11 +139,10 @@ efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_variable); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) +static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) { struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name); @@ -164,11 +162,10 @@ efi_status_t xen_efi_get_next_variable(unsigned long *name_size, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_variable); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data) +static efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(set_variable); @@ -185,11 +182,10 @@ efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_variable); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size) +static efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) { struct xen_platform_op op = INIT_EFI_OP(query_variable_info); @@ -207,9 +203,8 @@ efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, return efi_data(op).status; } 
-EXPORT_SYMBOL_GPL(xen_efi_query_variable_info); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count) +static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) { struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); @@ -220,10 +215,9 @@ efi_status_t xen_efi_get_next_high_mono_count(u32 *count) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_high_mono_count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list) +static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, unsigned long sg_list) { struct xen_platform_op op = INIT_EFI_OP(update_capsule); @@ -240,11 +234,9 @@ efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_update_capsule); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type) +static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, u64 *max_size, int *reset_type) { struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities); @@ -263,10 +255,9 @@ efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data) +static void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) { switch (reset_type) { case EFI_RESET_COLD: @@ -280,4 +271,85 @@ void xen_efi_reset_system(int reset_type, efi_status_t status, BUG(); } } -EXPORT_SYMBOL_GPL(xen_efi_reset_system); + +/* + * Set XEN EFI runtime services function pointers. Other fields of struct efi, + * e.g. efi.systab, will be set like normal EFI. 
+ */ +void __init xen_efi_runtime_setup(void) +{ + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; +} + +int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) +{ + static_assert(XEN_PAGE_SHIFT == EFI_PAGE_SHIFT, + "Mismatch between EFI_PAGE_SHIFT and XEN_PAGE_SHIFT"); + struct xen_platform_op op; + union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info; + int rc; + + if (!efi_enabled(EFI_PARAVIRT) || efi_enabled(EFI_MEMMAP)) + return __efi_mem_desc_lookup(phys_addr, out_md); + phys_addr &= ~(u64)(EFI_PAGE_SIZE - 1); + op = (struct xen_platform_op) { + .cmd = XENPF_firmware_info, + .u.firmware_info = { + .type = XEN_FW_EFI_INFO, + .index = XEN_FW_EFI_MEM_INFO, + .u.efi_info.mem.addr = phys_addr, + .u.efi_info.mem.size = U64_MAX - phys_addr, + }, + }; + + rc = HYPERVISOR_platform_op(&op); + if (rc) { + pr_warn("Failed to lookup header 0x%llx in Xen memory map: error %d\n", + phys_addr, rc); + } + + out_md->phys_addr = info->mem.addr; + out_md->num_pages = info->mem.size >> EFI_PAGE_SHIFT; + out_md->type = info->mem.type; + out_md->attribute = info->mem.attr; + + return 0; +} + +bool __init xen_efi_config_table_is_usable(const efi_guid_t *guid, + unsigned long table) +{ + efi_memory_desc_t md; + int rc; + + if (!efi_enabled(EFI_PARAVIRT)) + return true; + + rc = efi_mem_desc_lookup(table, &md); + if (rc) + return false; + + switch (md.type) { + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_ACPI_RECLAIM_MEMORY: + case EFI_ACPI_MEMORY_NVS: + case EFI_RESERVED_TYPE: + return true; + default: + return false; + } +} diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile index 62be55cd981d..92508d9a6bd2 100644 --- a/drivers/xen/events/Makefile +++ b/drivers/xen/events/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-y += events.o events-y += events_base.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 8edef51c92e5..e3585330cf98 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -47,43 +47,43 @@ static unsigned evtchn_2l_max_channels(void) return EVTCHN_2L_NR_CHANNELS; } -static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +static void evtchn_2l_remove(evtchn_port_t evtchn, unsigned int cpu) { - clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); - set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); + clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); } -static void evtchn_2l_clear_pending(unsigned port) +static void evtchn_2l_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, + unsigned int old_cpu) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); + clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, old_cpu))); + set_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); } -static void evtchn_2l_set_pending(unsigned port) 
+static void evtchn_2l_clear_pending(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); + sync_clear_bit(port, BM(&s->evtchn_pending[0])); } -static bool evtchn_2l_is_pending(unsigned port) +static void evtchn_2l_set_pending(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); + sync_set_bit(port, BM(&s->evtchn_pending[0])); } -static bool evtchn_2l_test_and_set_mask(unsigned port) +static bool evtchn_2l_is_pending(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); + return sync_test_bit(port, BM(&s->evtchn_pending[0])); } -static void evtchn_2l_mask(unsigned port) +static void evtchn_2l_mask(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; sync_set_bit(port, BM(&s->evtchn_mask[0])); } -static void evtchn_2l_unmask(unsigned port) +static void evtchn_2l_unmask(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; unsigned int cpu = get_cpu(); @@ -91,6 +91,8 @@ static void evtchn_2l_unmask(unsigned port) BUG_ON(!irqs_disabled()); + smp_wmb(); /* All writes before unmask must be visible. */ + if (unlikely((cpu != cpu_from_evtchn(port)))) do_hypercall = 1; else { @@ -159,7 +161,7 @@ static inline xen_ulong_t active_evtchns(unsigned int cpu, * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -static void evtchn_2l_handle_events(unsigned cpu) +static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) { int irq; xen_ulong_t pending_words; @@ -169,11 +171,11 @@ static void evtchn_2l_handle_events(unsigned cpu) int i; struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + evtchn_port_t evtchn; /* Timer interrupt has highest priority. */ - irq = irq_from_virq(cpu, VIRQ_TIMER); + irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn); if (irq != -1) { - unsigned int evtchn = evtchn_from_irq(irq); word_idx = evtchn / BITS_PER_LONG; bit_idx = evtchn % BITS_PER_LONG; if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) @@ -228,7 +230,7 @@ static void evtchn_2l_handle_events(unsigned cpu) do { xen_ulong_t bits; - int port; + evtchn_port_t port; bits = MASK_LSBS(pending_bits, bit_idx); @@ -240,10 +242,7 @@ static void evtchn_2l_handle_events(unsigned cpu) /* Process port. */ port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = get_evtchn_to_irq(port); - - if (irq != -1) - generic_handle_irq(irq); + handle_irq_for_port(port, ctrl); bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; @@ -329,9 +328,9 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", + printk(" %d: event %d -> irq %u%s%s%s\n", cpu_from_evtchn(i), i, - get_evtchn_to_irq(i), + irq_from_evtchn(i), sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) ? 
"" : " l2-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) @@ -355,18 +354,27 @@ static void evtchn_2l_resume(void) EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); } +static int evtchn_2l_percpu_deinit(unsigned int cpu) +{ + memset(per_cpu(cpu_evtchn_mask, cpu), 0, sizeof(xen_ulong_t) * + EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); + + return 0; +} + static const struct evtchn_ops evtchn_ops_2l = { .max_channels = evtchn_2l_max_channels, .nr_channels = evtchn_2l_max_channels, + .remove = evtchn_2l_remove, .bind_to_cpu = evtchn_2l_bind_to_cpu, .clear_pending = evtchn_2l_clear_pending, .set_pending = evtchn_2l_set_pending, .is_pending = evtchn_2l_is_pending, - .test_and_set_mask = evtchn_2l_test_and_set_mask, .mask = evtchn_2l_mask, .unmask = evtchn_2l_unmask, .handle_events = evtchn_2l_handle_events, .resume = evtchn_2l_resume, + .percpu_deinit = evtchn_2l_percpu_deinit, }; void __init xen_evtchn_2l_init(void) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 117e76b2f939..9478fae014e5 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Xen event channels * @@ -32,13 +33,20 @@ #include <linux/slab.h> #include <linux/irqnr.h> #include <linux/pci.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <linux/cpuhotplug.h> +#include <linux/atomic.h> +#include <linux/ktime.h> #ifdef CONFIG_X86 #include <asm/desc.h> #include <asm/ptrace.h> +#include <asm/idtentry.h> #include <asm/irq.h> #include <asm/io_apic.h> #include <asm/i8259.h> +#include <asm/xen/cpuid.h> #include <asm/xen/pci.h> #endif #include <asm/sync_bitops.h> @@ -57,10 +65,81 @@ #include <xen/interface/physdev.h> #include <xen/interface/sched.h> #include <xen/interface/vcpu.h> +#include <xen/xenbus.h> #include <asm/hw_irq.h> #include "events_internal.h" +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + struct list_head eoi_list; + struct rcu_work rwork; + short refcnt; + u8 spurious_cnt; + u8 is_accounted; + short type; /* type: IRQT_* */ + u8 mask_reason; /* Why is event channel masked */ +#define EVT_MASK_REASON_EXPLICIT 0x01 +#define EVT_MASK_REASON_TEMPORARY 0x02 +#define EVT_MASK_REASON_EOI_PENDING 0x04 + u8 is_active; /* Is event just being handled? */ + unsigned irq; + evtchn_port_t evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + unsigned short eoi_cpu; /* EOI must happen on this cpu-1 */ + unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ + u64 eoi_time; /* Time in jiffies when to EOI. 
*/ + raw_spinlock_t lock; + bool is_static; /* Is event channel static */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + struct xenbus_device *interdomain; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +static uint __read_mostly event_loop_timeout = 2; +module_param(event_loop_timeout, uint, 0644); + +static uint __read_mostly event_eoi_delay = 10; +module_param(event_eoi_delay, uint, 0644); + const struct evtchn_ops *evtchn_ops; /* @@ -69,6 +148,15 @@ const struct evtchn_ops *evtchn_ops; */ static DEFINE_MUTEX(irq_mapping_update_lock); +/* + * Lock hierarchy: + * + * irq_mapping_update_lock + * IRQ-desc lock + * percpu eoi_list_lock + * irq_info->lock + */ + static LIST_HEAD(xen_irq_list_head); /* IRQ <-> VIRQ mapping. */ @@ -76,12 +164,17 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */ +static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... XEN_NR_IPIS-1] = 0}; + +/* Event channel distribution data */ +static atomic_t channels_on_cpu[NR_CPUS]; -int **evtchn_to_irq; +static int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif -static bool (*pirq_needs_eoi)(unsigned irq); +static bool (*pirq_needs_eoi)(struct irq_info *info); #define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) #define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) @@ -90,18 +183,22 @@ static bool (*pirq_needs_eoi)(unsigned irq); /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) +static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY]; + static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); -static void disable_dynirq(struct irq_data *data); -static void clear_evtchn_to_irq_row(unsigned row) +static DEFINE_PER_CPU(unsigned int, irq_epoch); + +static void clear_evtchn_to_irq_row(int *evtchn_row) { unsigned col; for (col = 0; col < EVTCHN_PER_ROW; col++) - evtchn_to_irq[row][col] = -1; + WRITE_ONCE(evtchn_row[col], -1); } static void clear_evtchn_to_irq_all(void) @@ -111,14 +208,15 @@ static void clear_evtchn_to_irq_all(void) for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { if (evtchn_to_irq[row] == NULL) continue; - clear_evtchn_to_irq_row(row); + clear_evtchn_to_irq_row(evtchn_to_irq[row]); } } -static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) { unsigned row; unsigned col; + int *evtchn_row; if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; @@ -131,37 +229,105 @@ static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) if (irq == -1) return 0; - evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); - if (evtchn_to_irq[row] == NULL) + evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0); + if (evtchn_row == NULL) return -ENOMEM; - clear_evtchn_to_irq_row(row); + clear_evtchn_to_irq_row(evtchn_row); + + /* + * We've prepared an empty row for the mapping. 
If a different + * thread was faster inserting it, we can drop ours. + */ + if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL) + free_page((unsigned long) evtchn_row); } - evtchn_to_irq[row][col] = irq; + WRITE_ONCE(evtchn_to_irq[row][col], irq); return 0; } -int get_evtchn_to_irq(unsigned evtchn) +/* Get info for IRQ */ +static struct irq_info *info_for_irq(unsigned irq) { + if (irq < nr_legacy_irqs()) + return legacy_info_ptrs[irq]; + else + return irq_get_chip_data(irq); +} + +static void set_info_for_irq(unsigned int irq, struct irq_info *info) +{ + if (irq < nr_legacy_irqs()) + legacy_info_ptrs[irq] = info; + else + irq_set_chip_data(irq, info); +} + +static struct irq_info *evtchn_to_info(evtchn_port_t evtchn) +{ + int irq; + if (evtchn >= xen_evtchn_max_channels()) - return -1; + return NULL; if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) - return -1; - return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; + return NULL; + irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); + + return (irq < 0) ? NULL : info_for_irq(irq); } -/* Get info for IRQ */ -struct irq_info *info_for_irq(unsigned irq) +/* Per CPU channel accounting */ +static void channels_on_cpu_dec(struct irq_info *info) { - return irq_get_handler_data(irq); + if (!info->is_accounted) + return; + + info->is_accounted = 0; + + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1 , 0)); +} + +static void channels_on_cpu_inc(struct irq_info *info) +{ + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1, + INT_MAX))) + return; + + info->is_accounted = 1; +} + +static void xen_irq_free_desc(unsigned int irq) +{ + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + +static void delayed_free_irq(struct work_struct *work) +{ + struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, + rwork); + unsigned int irq = info->irq; + + /* Remove the info pointer only now, with no potential users left. */ + set_info_for_irq(irq, NULL); + + kfree(info); + + xen_irq_free_desc(irq); } /* Constructors for packed IRQ information. 
*/ static int xen_irq_info_common_setup(struct irq_info *info, - unsigned irq, enum xen_irq_type type, - unsigned evtchn, + evtchn_port_t evtchn, unsigned short cpu) { int ret; @@ -169,178 +335,196 @@ static int xen_irq_info_common_setup(struct irq_info *info, BUG_ON(info->type != IRQT_UNBOUND && info->type != type); info->type = type; - info->irq = irq; info->evtchn = evtchn; info->cpu = cpu; + info->mask_reason = EVT_MASK_REASON_EXPLICIT; + raw_spin_lock_init(&info->lock); - ret = set_evtchn_to_irq(evtchn, irq); + ret = set_evtchn_to_irq(evtchn, info->irq); if (ret < 0) return ret; - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN); - return xen_evtchn_port_setup(info); + return xen_evtchn_port_setup(evtchn); } -static int xen_irq_info_evtchn_setup(unsigned irq, - unsigned evtchn) +static int xen_irq_info_evtchn_setup(struct irq_info *info, + evtchn_port_t evtchn, + struct xenbus_device *dev) { - struct irq_info *info = info_for_irq(irq); + int ret; - return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0); + info->u.interdomain = dev; + if (dev) + atomic_inc(&dev->event_channels); + + return ret; } -static int xen_irq_info_ipi_setup(unsigned cpu, - unsigned irq, - unsigned evtchn, - enum ipi_vector ipi) +static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, enum ipi_vector ipi) { - struct irq_info *info = info_for_irq(irq); - info->u.ipi = ipi; - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_irq, cpu)[ipi] = info->irq; + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; - return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0); } -static int xen_irq_info_virq_setup(unsigned cpu, - unsigned irq, - unsigned evtchn, - unsigned virq) +static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, unsigned int virq) { - struct irq_info *info = info_for_irq(irq); - info->u.virq = virq; - per_cpu(virq_to_irq, cpu)[virq] = irq; + per_cpu(virq_to_irq, cpu)[virq] = info->irq; - return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0); } -static int xen_irq_info_pirq_setup(unsigned irq, - unsigned evtchn, - unsigned pirq, - unsigned gsi, - uint16_t domid, - unsigned char flags) +static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn, + unsigned int pirq, unsigned int gsi, + uint16_t domid, unsigned char flags) { - struct irq_info *info = info_for_irq(irq); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; - return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0); } static void xen_irq_info_cleanup(struct irq_info *info) { set_evtchn_to_irq(info->evtchn, -1); + xen_evtchn_port_remove(info->evtchn, info->cpu); info->evtchn = 0; + channels_on_cpu_dec(info); } /* * Accessors for packed IRQ information. 
*/ -unsigned int evtchn_from_irq(unsigned irq) +static evtchn_port_t evtchn_from_irq(unsigned int irq) { - if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + const struct irq_info *info = NULL; + + if (likely(irq < irq_get_nr_irqs())) + info = info_for_irq(irq); + if (!info) return 0; - return info_for_irq(irq)->evtchn; + return info->evtchn; } -unsigned irq_from_evtchn(unsigned int evtchn) +unsigned int irq_from_evtchn(evtchn_port_t evtchn) { - return get_evtchn_to_irq(evtchn); + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? info->irq : -1; } EXPORT_SYMBOL_GPL(irq_from_evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq) +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn) { - return per_cpu(virq_to_irq, cpu)[virq]; + int irq = per_cpu(virq_to_irq, cpu)[virq]; + + *evtchn = evtchn_from_irq(irq); + + return irq; } -static enum ipi_vector ipi_from_irq(unsigned irq) +static enum ipi_vector ipi_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_IPI); return info->u.ipi; } -static unsigned virq_from_irq(unsigned irq) +static unsigned int virq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_VIRQ); return info->u.virq; } -static unsigned pirq_from_irq(unsigned irq) +static unsigned int pirq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.pirq; } -static enum xen_irq_type type_from_irq(unsigned irq) +unsigned int cpu_from_evtchn(evtchn_port_t evtchn) { - return info_for_irq(irq)->type; + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? 
info->cpu : 0; } -unsigned cpu_from_irq(unsigned irq) +static void do_mask(struct irq_info *info, u8 reason) { - return info_for_irq(irq)->cpu; + unsigned long flags; + + raw_spin_lock_irqsave(&info->lock, flags); + + if (!info->mask_reason) + mask_evtchn(info->evtchn); + + info->mask_reason |= reason; + + raw_spin_unlock_irqrestore(&info->lock, flags); } -unsigned int cpu_from_evtchn(unsigned int evtchn) +static void do_unmask(struct irq_info *info, u8 reason) { - int irq = get_evtchn_to_irq(evtchn); - unsigned ret = 0; + unsigned long flags; - if (irq != -1) - ret = cpu_from_irq(irq); + raw_spin_lock_irqsave(&info->lock, flags); - return ret; + info->mask_reason &= ~reason; + + if (!info->mask_reason) + unmask_evtchn(info->evtchn); + + raw_spin_unlock_irqrestore(&info->lock, flags); } #ifdef CONFIG_X86 -static bool pirq_check_eoi_map(unsigned irq) +static bool pirq_check_eoi_map(struct irq_info *info) { - return test_bit(pirq_from_irq(irq), pirq_eoi_map); + return test_bit(pirq_from_irq(info), pirq_eoi_map); } #endif -static bool pirq_needs_eoi_flag(unsigned irq) +static bool pirq_needs_eoi_flag(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu, + bool force_affinity) { - int irq = get_evtchn_to_irq(chn); - struct irq_info *info = info_for_irq(irq); + if (IS_ENABLED(CONFIG_SMP) && force_affinity) { + struct irq_data *data = irq_get_irq_data(info->irq); - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu)); -#endif - xen_evtchn_port_bind_to_cpu(info, cpu); + irq_data_update_affinity(data, cpumask_of(cpu)); + irq_data_update_effective_affinity(data, cpumask_of(cpu)); + } + + xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu); + channels_on_cpu_dec(info); info->cpu = cpu; + channels_on_cpu_inc(info); } /** @@ -353,54 +537,216 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) */ void notify_remote_via_irq(int irq) { - int evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = evtchn_from_irq(irq); if (VALID_EVTCHN(evtchn)) notify_remote_via_evtchn(evtchn); } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void xen_irq_init(unsigned irq) +struct lateeoi_work { + struct delayed_work delayed; + spinlock_t eoi_list_lock; + struct list_head eoi_list; +}; + +static DEFINE_PER_CPU(struct lateeoi_work, lateeoi); + +static void lateeoi_list_del(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + unsigned long flags; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + list_del_init(&info->eoi_list); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void lateeoi_list_add(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + struct irq_info *elem; + u64 now = get_jiffies_64(); + unsigned long delay; + unsigned long flags; + + if (now < info->eoi_time) + delay = info->eoi_time - now; + else + delay = 1; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + + elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + if (!elem || info->eoi_time < elem->eoi_time) { + list_add(&info->eoi_list, &eoi->eoi_list); + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, delay); + } else { + list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) { + if (elem->eoi_time <= 
info->eoi_time) + break; + } + list_add(&info->eoi_list, &elem->eoi_list); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) { + evtchn_port_t evtchn; + unsigned int cpu; + unsigned int delay = 0; + + evtchn = info->evtchn; + if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list)) + return; + + if (spurious) { + struct xenbus_device *dev = info->u.interdomain; + unsigned int threshold = 1; + + if (dev && dev->spurious_threshold) + threshold = dev->spurious_threshold; + + if ((1 << info->spurious_cnt) < (HZ << 2)) { + if (info->spurious_cnt != 0xFF) + info->spurious_cnt++; + } + if (info->spurious_cnt > threshold) { + delay = 1 << (info->spurious_cnt - 1 - threshold); + if (delay > HZ) + delay = HZ; + if (!info->eoi_time) + info->eoi_cpu = smp_processor_id(); + info->eoi_time = get_jiffies_64() + delay; + if (dev) + atomic_add(delay, &dev->jiffies_eoi_delayed); + } + if (dev) + atomic_inc(&dev->spurious_events); + } else { + info->spurious_cnt = 0; + } + + cpu = info->eoi_cpu; + if (info->eoi_time && + (info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) { + lateeoi_list_add(info); + return; + } + + info->eoi_time = 0; + + /* is_active hasn't been reset yet, do it now. */ + smp_store_release(&info->is_active, 0); + do_unmask(info, EVT_MASK_REASON_EOI_PENDING); +} + +static void xen_irq_lateeoi_worker(struct work_struct *work) +{ + struct lateeoi_work *eoi; struct irq_info *info; -#ifdef CONFIG_SMP - /* By default all event channels notify CPU#0. */ - cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0)); -#endif + u64 now = get_jiffies_64(); + unsigned long flags; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info == NULL) - panic("Unable to allocate metadata for IRQ%d\n", irq); + eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed); + + rcu_read_lock(); - info->type = IRQT_UNBOUND; - info->refcnt = -1; + while (true) { + spin_lock_irqsave(&eoi->eoi_list_lock, flags); - irq_set_handler_data(irq, info); + info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + + if (info == NULL) + break; - list_add_tail(&info->list, &xen_irq_list_head); + if (now < info->eoi_time) { + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, + info->eoi_time - now); + break; + } + + list_del_init(&info->eoi_list); + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); + + info->eoi_time = 0; + + xen_irq_lateeoi_locked(info, false); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); + + rcu_read_unlock(); } -static int __must_check xen_allocate_irqs_dynamic(int nvec) +static void xen_cpu_init_eoi(unsigned int cpu) { - int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu); - if (irq >= 0) { - for (i = 0; i < nvec; i++) - xen_irq_init(irq + i); + INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker); + spin_lock_init(&eoi->eoi_list_lock); + INIT_LIST_HEAD(&eoi->eoi_list); +} + +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) +{ + struct irq_info *info; + + rcu_read_lock(); + + info = info_for_irq(irq); + + if (info) + xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(xen_irq_lateeoi); + +static struct irq_info *xen_irq_init(unsigned int irq) +{ + struct irq_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info) { + info->irq = irq; + info->type = IRQT_UNBOUND; + info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, 
delayed_free_irq); + + set_info_for_irq(irq, info); + INIT_LIST_HEAD(&info->eoi_list); + list_add_tail(&info->list, &xen_irq_list_head); } - return irq; + return info; } -static inline int __must_check xen_allocate_irq_dynamic(void) +static struct irq_info *xen_allocate_irq_dynamic(void) { + int irq = irq_alloc_desc_from(0, -1); + struct irq_info *info = NULL; - return xen_allocate_irqs_dynamic(1); + if (irq >= 0) { + info = xen_irq_init(irq); + if (!info) + xen_irq_free_desc(irq); + } + + return info; } -static int __must_check xen_allocate_irq_gsi(unsigned gsi) +static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) { int irq; + struct irq_info *info; /* * A PV guest has no concept of a GSI (since it has no ACPI @@ -417,50 +763,40 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - xen_irq_init(irq); + info = xen_irq_init(irq); + if (!info) + xen_irq_free_desc(irq); - return irq; + return info; } -static void xen_free_irq(unsigned irq) +static void xen_free_irq(struct irq_info *info) { - struct irq_info *info = irq_get_handler_data(irq); - if (WARN_ON(!info)) return; - list_del(&info->list); + if (!list_empty(&info->eoi_list)) + lateeoi_list_del(info); - irq_set_handler_data(irq, NULL); + list_del(&info->list); WARN_ON(info->refcnt > 0); - kfree(info); - - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq < nr_legacy_irqs()) - return; - - irq_free_desc(irq); + queue_rcu_work(system_wq, &info->rwork); } -static void xen_evtchn_close(unsigned int port) +/* Not called for lateeoi events. */ +static void event_handler_exit(struct irq_info *info) { - struct evtchn_close close; - - close.port = port; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); + smp_store_release(&info->is_active, 0); + clear_evtchn(info->evtchn); } -static void pirq_query_unmask(int irq) +static void pirq_query_unmask(struct irq_info *info) { struct physdev_irq_status_query irq_status; - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info->type != IRQT_PIRQ); - irq_status.irq = pirq_from_irq(irq); + irq_status.irq = pirq_from_irq(info); if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) irq_status.flags = 0; @@ -469,107 +805,120 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static void eoi_pirq(struct irq_data *data) +static void do_eoi_pirq(struct irq_info *info) { - int evtchn = evtchn_from_irq(data->irq); - struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + struct physdev_eoi eoi = { .irq = pirq_from_irq(info) }; int rc = 0; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(info->evtchn)) return; - if (unlikely(irqd_is_setaffinity_pending(data)) && - likely(!irqd_irq_disabled(data))) { - int masked = test_and_set_mask(evtchn); - - clear_evtchn(evtchn); - - irq_move_masked_irq(data); - - if (!masked) - unmask_evtchn(evtchn); - } else - clear_evtchn(evtchn); + event_handler_exit(info); - if (pirq_needs_eoi(data->irq)) { + if (pirq_needs_eoi(info)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); WARN_ON(rc); } } +static void eoi_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + do_eoi_pirq(info); +} + +static void do_disable_dynirq(struct irq_info *info) +{ + if (VALID_EVTCHN(info->evtchn)) + do_mask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void disable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_disable_dynirq(info); +} + static void 
mask_ack_pirq(struct irq_data *data) { - disable_dynirq(data); - eoi_pirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_eoi_pirq(info); + } } -static unsigned int __startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(struct irq_info *info) { struct evtchn_bind_pirq bind_pirq; - struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = info->evtchn; int rc; - BUG_ON(info->type != IRQT_PIRQ); - if (VALID_EVTCHN(evtchn)) goto out; - bind_pirq.pirq = pirq_from_irq(irq); + bind_pirq.pirq = pirq_from_irq(info); /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - pr_warn("Failed to obtain physical IRQ %d\n", irq); + pr_warn("Failed to obtain physical IRQ %d\n", info->irq); return 0; } evtchn = bind_pirq.port; - pirq_query_unmask(irq); + pirq_query_unmask(info); - rc = set_evtchn_to_irq(evtchn, irq); + rc = set_evtchn_to_irq(evtchn, info->irq); if (rc) goto err; info->evtchn = evtchn; - bind_evtchn_to_cpu(evtchn, 0); + bind_evtchn_to_cpu(info, 0, false); - rc = xen_evtchn_port_setup(info); + rc = xen_evtchn_port_setup(evtchn); if (rc) goto err; out: - unmask_evtchn(evtchn); - eoi_pirq(irq_get_irq_data(irq)); + do_unmask(info, EVT_MASK_REASON_EXPLICIT); + + do_eoi_pirq(info); return 0; err: - pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq, + rc); xen_evtchn_close(evtchn); return 0; } static unsigned int startup_pirq(struct irq_data *data) { - return __startup_pirq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + + return __startup_pirq(info); } static void shutdown_pirq(struct irq_data *data) { - unsigned int irq = data->irq; - struct irq_info *info = info_for_irq(irq); - unsigned evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; BUG_ON(info->type != IRQT_PIRQ); if (!VALID_EVTCHN(evtchn)) return; - mask_evtchn(evtchn); - xen_evtchn_close(evtchn); + do_mask(info, EVT_MASK_REASON_EXPLICIT); xen_irq_info_cleanup(info); + xen_evtchn_close(evtchn); } static void enable_pirq(struct irq_data *data) @@ -598,10 +947,15 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); -static void __unbind_from_irq(unsigned int irq) +static void __unbind_from_irq(struct irq_info *info, unsigned int irq) { - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); + evtchn_port_t evtchn; + bool close_evtchn = false; + + if (!info) { + xen_irq_free_desc(irq); + return; + } if (info->refcnt > 0) { info->refcnt--; @@ -609,26 +963,39 @@ static void __unbind_from_irq(unsigned int irq) return; } + evtchn = info->evtchn; + if (VALID_EVTCHN(evtchn)) { - unsigned int cpu = cpu_from_irq(irq); + unsigned int cpu = info->cpu; + struct xenbus_device *dev; - xen_evtchn_close(evtchn); + if (!info->is_static) + close_evtchn = true; - switch (type_from_irq(irq)) { + switch (info->type) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0; + break; + case IRQT_EVTCHN: + dev = 
info->u.interdomain; + if (dev) + atomic_dec(&dev->event_channels); break; default: break; } xen_irq_info_cleanup(info); + + if (close_evtchn) + xen_evtchn_close(evtchn); } - xen_free_irq(irq); + xen_free_irq(info); } /* @@ -644,24 +1011,24 @@ static void __unbind_from_irq(unsigned int irq) int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { - int irq = -1; + struct irq_info *info; struct physdev_irq irq_op; int ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_irq_from_gsi(gsi); - if (irq != -1) { + ret = xen_irq_from_gsi(gsi); + if (ret != -1) { pr_info("%s: returning irq %d for gsi %u\n", - __func__, irq, gsi); + __func__, ret, gsi); goto out; } - irq = xen_allocate_irq_gsi(gsi); - if (irq < 0) + info = xen_allocate_irq_gsi(gsi); + if (!info) goto out; - irq_op.irq = irq; + irq_op.irq = info->irq; irq_op.vector = 0; /* Only the privileged domain can do this. For non-priv, the pcifront @@ -669,20 +1036,19 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - xen_free_irq(irq); - irq = -ENOSPC; + xen_free_irq(info); + ret = -ENOSPC; goto out; } - ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - pirq_query_unmask(irq); + pirq_query_unmask(info); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. @@ -699,16 +1065,18 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * is the right choice either way. */ if (shareable) - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_fasteoi_irq, name); else - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_edge_irq, name); + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } #ifdef CONFIG_PCI_MSI @@ -730,17 +1098,24 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, int pirq, int nvec, const char *name, domid_t domid) { int i, irq, ret; + struct irq_info *info; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irqs_dynamic(nvec); + irq = irq_alloc_descs(-1, 0, nvec, -1); if (irq < 0) goto out; for (i = 0; i < nvec; i++) { + info = xen_irq_init(irq + i); + if (!info) { + ret = -ENOMEM; + goto error_irq; + } + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); - ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid, i == 0 ? 
0 : PIRQ_MSI_GROUP); if (ret < 0) goto error_irq; @@ -752,9 +1127,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, out: mutex_unlock(&irq_mapping_update_lock); return irq; + error_irq: - while (nvec--) - __unbind_from_irq(irq + nvec); + while (nvec--) { + info = info_for_irq(irq + nvec); + __unbind_from_irq(info, irq + nvec); + } mutex_unlock(&irq_mapping_update_lock); return ret; } @@ -790,98 +1168,97 @@ int xen_destroy_irq(int irq) } } - xen_free_irq(irq); + xen_free_irq(info); out: mutex_unlock(&irq_mapping_update_lock); return rc; } -int xen_irq_from_pirq(unsigned pirq) -{ - int irq; - - struct irq_info *info; - - mutex_lock(&irq_mapping_update_lock); - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - irq = info->irq; - if (info->u.pirq.pirq == pirq) - goto out; - } - irq = -1; -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - - int xen_pirq_from_irq(unsigned irq) { - return pirq_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + + return pirq_from_irq(info); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); -int bind_evtchn_to_irq(unsigned int evtchn) +static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, + struct xenbus_device *dev, bool shared) { - int irq; - int ret; + int ret = -ENOMEM; + struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); + info = evtchn_to_info(evtchn); - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (!info) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(info->irq, chip, handle_edge_irq, "event"); - ret = xen_irq_info_evtchn_setup(irq, evtchn); + ret = xen_irq_info_evtchn_setup(info, evtchn, dev); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - /* New interdomain events are bound to VCPU 0. */ - bind_evtchn_to_cpu(evtchn, 0); - } else { - struct irq_info *info = info_for_irq(irq); - WARN_ON(info == NULL || info->type != IRQT_EVTCHN); + /* + * New interdomain events are initially bound to vCPU0 This + * is required to setup the event channel in the first + * place and also important for UP guests because the + * affinity setting is not invoked on them so nothing would + * bind the channel. 
+ */ + bind_evtchn_to_cpu(info, 0, false); + } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { + if (shared && !WARN_ON(info->refcnt < 0)) + info->refcnt++; } + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; +} + +int bind_evtchn_to_irq(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false); } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); +int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; + evtchn_port_t evtchn; + struct irq_info *info; int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(ipi_to_irq, cpu)[ipi]; + ret = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "ipi"); bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -890,46 +1267,64 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - bind_evtchn_to_cpu(evtchn, cpu); + /* + * Force the affinity mask to the target CPU so proc shows + * the correct target. + */ + bind_evtchn_to_cpu(info, cpu, true); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_IPI); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } -int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) +static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, + evtchn_port_t remote_port, + struct irq_chip *chip, + bool shared) { struct evtchn_bind_interdomain bind_interdomain; int err; - bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_dom = dev->otherend_id; bind_interdomain.remote_port = remote_port; err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain); - return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); + return err ? 
: bind_evtchn_to_irq_chip(bind_interdomain.local_port, + chip, dev, shared); +} + +int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, + &xen_lateeoi_chip, false); } -EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq); +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); -static int find_virq(unsigned int virq, unsigned int cpu) +static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn, + bool percpu) { struct evtchn_status status; - int port, rc = -ENOENT; + evtchn_port_t port; + bool exists = false; memset(&status, 0, sizeof(status)); for (port = 0; port < xen_evtchn_max_channels(); port++) { + int rc; + status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -937,12 +1332,16 @@ static int find_virq(unsigned int virq, unsigned int cpu) continue; if (status.status != EVTCHNSTAT_virq) continue; - if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) { - rc = port; - break; + if (status.u.virq != virq) + continue; + if (status.vcpu == xen_vcpu_nr(cpu)) { + *evtchn = port; + return 0; + } else if (!percpu) { + exists = true; } } - return rc; + return exists ? -EEXIST : -ENOENT; } /** @@ -961,22 +1360,24 @@ EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) { struct evtchn_bind_virq bind_virq; - int evtchn, irq, ret; + evtchn_port_t evtchn = 0; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(virq_to_irq, cpu)[virq]; + ret = per_cpu(virq_to_irq, cpu)[virq]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; if (percpu) - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "virq"); else - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip, handle_edge_irq, "virq"); bind_virq.virq = virq; @@ -987,45 +1388,56 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) evtchn = bind_virq.port; else { if (ret == -EEXIST) - ret = find_virq(virq, cpu); - BUG_ON(ret < 0); - evtchn = ret; + ret = find_virq(virq, cpu, &evtchn, percpu); + if (ret) { + __unbind_from_irq(info, info->irq); + goto out; + } } - ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - bind_evtchn_to_cpu(evtchn, cpu); + /* + * Force the affinity mask for percpu interrupts so proc + * shows the correct target. 
+ */ + bind_evtchn_to_cpu(info, cpu, percpu); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static void unbind_from_irq(unsigned int irq) { + struct irq_info *info; + mutex_lock(&irq_mapping_update_lock); - __unbind_from_irq(irq); + info = info_for_irq(irq); + __unbind_from_irq(info, irq); mutex_unlock(&irq_mapping_update_lock); } -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) +static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id, + struct irq_chip *chip) { int irq, retval; - irq = bind_evtchn_to_irq(evtchn); + irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL, + irqflags & IRQF_SHARED); if (irq < 0) return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); @@ -1036,18 +1448,38 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, return irq; } + +int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); -int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) +int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); + +static int bind_interdomain_evtchn_to_irqhandler_chip( + struct xenbus_device *dev, evtchn_port_t remote_port, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, struct irq_chip *chip) { int irq, retval; - irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip, + irqflags & IRQF_SHARED); if (irq < 0) return irq; @@ -1059,7 +1491,19 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, return irq; } -EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + +int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(dev, + remote_port, handler, irqflags, devname, + dev_id, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi); int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, @@ -1105,7 +1549,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void unbind_from_irqhandler(unsigned int irq, void *dev_id) { - struct irq_info *info = irq_get_handler_data(irq); + struct irq_info *info = info_for_irq(irq); if (WARN_ON(!info)) return; @@ -1131,15 +1575,9 @@ int xen_set_irq_priority(unsigned irq, unsigned priority) } EXPORT_SYMBOL_GPL(xen_set_irq_priority); -int evtchn_make_refcounted(unsigned int evtchn) +int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) { - int irq = 
get_evtchn_to_irq(evtchn); - struct irq_info *info; - - if (irq == -1) - return -ENOENT; - - info = irq_get_handler_data(irq); + struct irq_info *info = evtchn_to_info(evtchn); if (!info) return -ENOENT; @@ -1147,14 +1585,14 @@ int evtchn_make_refcounted(unsigned int evtchn) WARN_ON(info->refcnt != -1); info->refcnt = 1; + info->is_static = is_static; return 0; } EXPORT_SYMBOL_GPL(evtchn_make_refcounted); -int evtchn_get(unsigned int evtchn) +int evtchn_get(evtchn_port_t evtchn) { - int irq; struct irq_info *info; int err = -ENOENT; @@ -1163,17 +1601,13 @@ int evtchn_get(unsigned int evtchn) mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); - if (irq == -1) - goto done; - - info = irq_get_handler_data(irq); + info = evtchn_to_info(evtchn); if (!info) goto done; err = -EINVAL; - if (info->refcnt <= 0) + if (info->refcnt <= 0 || info->refcnt == SHRT_MAX) goto done; info->refcnt++; @@ -1185,18 +1619,19 @@ int evtchn_get(unsigned int evtchn) } EXPORT_SYMBOL_GPL(evtchn_get); -void evtchn_put(unsigned int evtchn) +void evtchn_put(evtchn_port_t evtchn) { - int irq = get_evtchn_to_irq(evtchn); - if (WARN_ON(irq == -1)) + struct irq_info *info = evtchn_to_info(evtchn); + + if (WARN_ON(!info)) return; - unbind_from_irq(irq); + unbind_from_irq(info->irq); } EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq; + evtchn_port_t evtchn; #ifdef CONFIG_X86 if (unlikely(vector == XEN_NMI_VECTOR)) { @@ -1207,61 +1642,105 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) return; } #endif - irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + BUG_ON(evtchn == 0); + notify_remote_via_evtchn(evtchn); } -static DEFINE_PER_CPU(unsigned, xed_nesting_count); - -static void __xen_evtchn_do_upcall(void) -{ - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - int cpu = get_cpu(); +struct evtchn_loop_ctrl { + ktime_t timeout; unsigned count; + bool defer_eoi; +}; - do { - vcpu_info->evtchn_upcall_pending = 0; +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) +{ + struct irq_info *info = evtchn_to_info(port); + struct xenbus_device *dev; - if (__this_cpu_inc_return(xed_nesting_count) - 1) - goto out; + if (!info) + return; - xen_evtchn_handle_events(cpu); + /* + * Check for timeout every 256 events. + * We are setting the timeout value only after the first 256 + * events in order to not hurt the common case of few loop + * iterations. The 256 is basically an arbitrary value. + * + * In case we are hitting the timeout we need to defer all further + * EOIs in order to ensure to leave the event handling loop rather + * sooner than later. + */ + if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) { + ktime_t kt = ktime_get(); + + if (!ctrl->timeout) { + kt = ktime_add_ms(kt, + jiffies_to_msecs(event_loop_timeout)); + ctrl->timeout = kt; + } else if (kt > ctrl->timeout) { + ctrl->defer_eoi = true; + } + } - BUG_ON(!irqs_disabled()); + if (xchg_acquire(&info->is_active, 1)) + return; - count = __this_cpu_read(xed_nesting_count); - __this_cpu_write(xed_nesting_count, 0); - } while (count != 1 || vcpu_info->evtchn_upcall_pending); + dev = (info->type == IRQT_EVTCHN) ? 
info->u.interdomain : NULL; + if (dev) + atomic_inc(&dev->events); -out: + if (ctrl->defer_eoi) { + info->eoi_cpu = smp_processor_id(); + info->irq_epoch = __this_cpu_read(irq_epoch); + info->eoi_time = get_jiffies_64() + event_eoi_delay; + } - put_cpu(); + generic_handle_irq(info->irq); } -void xen_evtchn_do_upcall(struct pt_regs *regs) +int xen_evtchn_do_upcall(void) { - struct pt_regs *old_regs = set_irq_regs(regs); + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int ret = vcpu_info->evtchn_upcall_pending ? IRQ_HANDLED : IRQ_NONE; + int cpu = smp_processor_id(); + struct evtchn_loop_ctrl ctrl = { 0 }; - irq_enter(); -#ifdef CONFIG_X86 - inc_irq_stat(irq_hv_callback_count); -#endif + /* + * When closing an event channel the associated IRQ must not be freed + * until all cpus have left the event handling loop. This is ensured + * by taking the rcu_read_lock() while handling events, as freeing of + * the IRQ is handled via queue_rcu_work() _after_ closing the event + * channel. + */ + rcu_read_lock(); - __xen_evtchn_do_upcall(); + do { + vcpu_info->evtchn_upcall_pending = 0; - irq_exit(); - set_irq_regs(old_regs); -} + xen_evtchn_handle_events(cpu, &ctrl); -void xen_hvm_evtchn_do_upcall(void) -{ - __xen_evtchn_do_upcall(); + BUG_ON(!irqs_disabled()); + + virt_rmb(); /* Hypervisor can set upcall pending. */ + + } while (vcpu_info->evtchn_upcall_pending); + + rcu_read_unlock(); + + /* + * Increment irq_epoch only now to defer EOIs only for + * xen_irq_lateeoi() invocations occurring from inside the loop + * above. + */ + __this_cpu_inc(irq_epoch); + + return ret; } -EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); +EXPORT_SYMBOL_GPL(xen_evtchn_do_upcall); /* Rebind a new event channel to an existing irq. */ -void rebind_evtchn_irq(int evtchn, int irq) +void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) { struct irq_info *info = info_for_irq(irq); @@ -1275,28 +1754,27 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(get_evtchn_to_irq(evtchn) != -1); + BUG_ON(evtchn_to_info(evtchn)); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - (void)xen_irq_info_evtchn_setup(irq, evtchn); + info->irq = irq; + (void)xen_irq_info_evtchn_setup(info, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); - bind_evtchn_to_cpu(evtchn, info->cpu); - /* This will be deferred until interrupt is processed */ - irq_set_affinity(irq, cpumask_of(info->cpu)); + bind_evtchn_to_cpu(info, info->cpu, false); /* Unmask the event channel. */ enable_irq(irq); } /* Rebind an evtchn so that it gets delivered to a specific cpu */ -int xen_rebind_evtchn_to_cpu(int evtchn, unsigned tcpu) +static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) { struct evtchn_bind_vcpu bind_vcpu; - int masked; + evtchn_port_t evtchn = info ? info->evtchn : 0; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1312,29 +1790,67 @@ int xen_rebind_evtchn_to_cpu(int evtchn, unsigned tcpu) * Mask the event while changing the VCPU binding to prevent * it being delivered on an unexpected VCPU. */ - masked = test_and_set_mask(evtchn); + do_mask(info, EVT_MASK_REASON_TEMPORARY); /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. 
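
The EVT_MASK_REASON_TEMPORARY masking used just above is one piece of a small reason-bitmask scheme (see do_mask()/do_unmask() earlier in this file): each caller records why it masked the channel, and the real unmask is issued only once every recorded reason has been dropped. Below is a minimal, single-threaded userspace model of that scheme; the reason names are the ones this patch introduces, but the bit values and the printf() stand-ins for mask_evtchn()/unmask_evtchn() are illustrative only, and the real code additionally serializes callers with info->lock:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative values; the kernel defines its own. */
    #define EVT_MASK_REASON_EXPLICIT    0x01
    #define EVT_MASK_REASON_TEMPORARY   0x02
    #define EVT_MASK_REASON_EOI_PENDING 0x04

    static uint8_t mask_reason;

    static void model_mask(uint8_t reason)
    {
            if (!mask_reason)
                    printf("mask_evtchn()\n");   /* first reason really masks */
            mask_reason |= reason;
    }

    static void model_unmask(uint8_t reason)
    {
            mask_reason &= ~reason;
            if (!mask_reason)
                    printf("unmask_evtchn()\n"); /* last reason really unmasks */
    }

    int main(void)
    {
            model_mask(EVT_MASK_REASON_EXPLICIT);    /* masks the channel   */
            model_mask(EVT_MASK_REASON_TEMPORARY);   /* no-op, already masked */
            model_unmask(EVT_MASK_REASON_TEMPORARY); /* still masked: EXPLICIT remains */
            model_unmask(EVT_MASK_REASON_EXPLICIT);  /* unmasks the channel */
            return 0;
    }

The point of the scheme is that a temporary mask taken during a vCPU rebind cannot accidentally unmask a channel that is also masked for an explicit or EOI-pending reason.
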
*/ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) { + int old_cpu = info->cpu; + + bind_evtchn_to_cpu(info, tcpu, false); - if (!masked) - unmask_evtchn(evtchn); + if (info->type == IRQT_VIRQ) { + int virq = info->u.virq; + int irq = per_cpu(virq_to_irq, old_cpu)[virq]; + + per_cpu(virq_to_irq, old_cpu)[virq] = -1; + per_cpu(virq_to_irq, tcpu)[virq] = irq; + } + } + + do_unmask(info, EVT_MASK_REASON_TEMPORARY); return 0; } -EXPORT_SYMBOL_GPL(xen_rebind_evtchn_to_cpu); + +/* + * Find the CPU within @dest mask which has the least number of channels + * assigned. This is not precise as the per cpu counts can be modified + * concurrently. + */ +static unsigned int select_target_cpu(const struct cpumask *dest) +{ + unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX; + + for_each_cpu_and(cpu, dest, cpu_online_mask) { + unsigned int curch = atomic_read(&channels_on_cpu[cpu]); + + if (curch < minch) { + minch = curch; + best_cpu = cpu; + } + } + + /* + * Catch the unlikely case that dest contains no online CPUs. Can't + * recurse. + */ + if (best_cpu == UINT_MAX) + return select_target_cpu(cpu_online_mask); + + return best_cpu; +} static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); - int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu); + unsigned int tcpu = select_target_cpu(dest); + int ret; + ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu); if (!ret) irq_data_update_effective_affinity(data, cpumask_of(tcpu)); @@ -1343,59 +1859,77 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, static void enable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_EXPLICIT); } -static void disable_dynirq(struct irq_data *data) +static void do_ack_dynirq(struct irq_info *info) { - int evtchn = evtchn_from_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); + event_handler_exit(info); } static void ack_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); - if (!VALID_EVTCHN(evtchn)) - return; + if (info) + do_ack_dynirq(info); +} - if (unlikely(irqd_is_setaffinity_pending(data)) && - likely(!irqd_irq_disabled(data))) { - int masked = test_and_set_mask(evtchn); +static void mask_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); - clear_evtchn(evtchn); + if (info) { + do_disable_dynirq(info); + do_ack_dynirq(info); + } +} - irq_move_masked_irq(data); +static void lateeoi_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; - if (!masked) - unmask_evtchn(evtchn); - } else + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EOI_PENDING); + /* + * Don't call event_handler_exit(). + * Need to keep is_active non-zero in order to ignore re-raised + * events after cpu affinity changes while a lateeoi is pending. 
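
The lateeoi_*_dynirq handlers above implement the core side of the delayed-EOI contract: an event bound through the lateeoi chip stays masked after delivery until the driver signals completion with xen_irq_lateeoi(), optionally flagging the event as spurious so the core can throttle a misbehaving other end. A sketch of the expected driver-side usage, pieced together from the bind_*_lateeoi helpers added in this patch; my_backend_interrupt(), my_backend_work() and my_backend_connect() are placeholder names, not kernel functions:

    #include <linux/interrupt.h>
    #include <xen/events.h>
    #include <xen/xenbus.h>

    static bool my_backend_work(void *dev_id);  /* placeholder helper */

    static irqreturn_t my_backend_interrupt(int irq, void *dev_id)
    {
            /* Flag the event as spurious unless real work was found, so
             * the core can delay an other end raising events for nothing. */
            unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;

            if (my_backend_work(dev_id))
                    eoi_flags = 0;

            /* The channel stays masked from delivery until this call. */
            xen_irq_lateeoi(irq, eoi_flags);

            return IRQ_HANDLED;
    }

    static int my_backend_connect(struct xenbus_device *dev,
                                  evtchn_port_t remote_port, void *dev_id)
    {
            int irq;

            irq = bind_interdomain_evtchn_to_irqhandler_lateeoi(dev,
                            remote_port, my_backend_interrupt, 0,
                            "my-backend", dev_id);
            /* A real driver would keep irq for unbind_from_irqhandler(). */
            return irq < 0 ? irq : 0;
    }

This is the usage model the pv backends were converted to after XSA-332: the guest cannot storm the backend, because each event must be explicitly acknowledged before the next one can be delivered.
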
+ */ clear_evtchn(evtchn); + } } -static void mask_ack_dynirq(struct irq_data *data) +static void lateeoi_mask_ack_dynirq(struct irq_data *data) { - disable_dynirq(data); - ack_dynirq(data); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); + } } static int retrigger_dynirq(struct irq_data *data) { - unsigned int evtchn = evtchn_from_irq(data->irq); - int masked; + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (!VALID_EVTCHN(evtchn)) return 0; - masked = test_and_set_mask(evtchn); + do_mask(info, EVT_MASK_REASON_TEMPORARY); set_evtchn(evtchn); - if (!masked) - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_TEMPORARY); return 1; } @@ -1428,26 +1962,29 @@ static void restore_pirqs(void) if (rc) { pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - xen_free_irq(irq); + xen_free_irq(info); continue; } printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - __startup_pirq(irq); + __startup_pirq(info); } } static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; - int virq, irq, evtchn; + evtchn_port_t evtchn; + struct irq_info *info; + int virq, irq; for (virq = 0; virq < NR_VIRQS; virq++) { if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(virq_from_irq(irq) != virq); + BUG_ON(virq_from_irq(info) != virq); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1458,21 +1995,25 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); - bind_evtchn_to_cpu(evtchn, cpu); + xen_irq_info_virq_setup(info, cpu, evtchn, virq); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(info, cpu, false); } } static void restore_cpu_ipis(unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; - int ipi, irq, evtchn; + evtchn_port_t evtchn; + struct irq_info *info; + int ipi, irq; for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(ipi_from_irq(irq) != ipi); + BUG_ON(ipi_from_irq(info) != ipi); /* Get a new binding from Xen. */ bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1482,31 +2023,26 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); - bind_evtchn_to_cpu(evtchn, cpu); + xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(info, cpu, false); } } /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq) { - int evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); + event_handler_exit(info); } EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - set_evtchn(evtchn); -} bool xen_test_irq_pending(int irq) { - int evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = evtchn_from_irq(irq); bool ret = false; if (VALID_EVTCHN(evtchn)) @@ -1566,8 +2102,12 @@ void xen_irq_resume(void) xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. 
*/ - list_for_each_entry(info, &xen_irq_list_head, list) - info->evtchn = 0; /* zap event-channel binding */ + list_for_each_entry(info, &xen_irq_list_head, list) { + /* Zap event-channel binding */ + info->evtchn = 0; + /* Adjust accounting */ + channels_on_cpu_dec(info); + } clear_evtchn_to_irq_all(); @@ -1593,6 +2133,21 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_retrigger = retrigger_dynirq, }; +static struct irq_chip xen_lateeoi_chip __read_mostly = { + /* The chip name needs to contain "xen-dyn" for irqbalance to work. */ + .name = "xen-dyn-lateeoi", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = lateeoi_ack_dynirq, + .irq_mask_ack = lateeoi_mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + static struct irq_chip xen_pirq_chip __read_mostly = { .name = "xen-pirq", @@ -1623,57 +2178,119 @@ static struct irq_chip xen_percpu_chip __read_mostly = { .irq_ack = ack_dynirq, }; -int xen_set_callback_via(uint64_t via) -{ - struct xen_hvm_param a; - a.domid = DOMID_SELF; - a.index = HVM_PARAM_CALLBACK_IRQ; - a.value = via; - return HYPERVISOR_hvm_op(HVMOP_set_param, &a); -} -EXPORT_SYMBOL_GPL(xen_set_callback_via); - +#ifdef CONFIG_X86 #ifdef CONFIG_XEN_PVHVM /* Vector callbacks are better than PCI interrupts to receive event * channel notifications because we can receive vector callbacks on any * vcpu and we don't need PCI support or APIC interactions. */ -void xen_callback_vector(void) +void xen_setup_callback_vector(void) { - int rc; uint64_t callback_via; if (xen_have_vector_callback) { callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); - rc = xen_set_callback_via(callback_via); - if (rc) { + if (xen_set_callback_via(callback_via)) { pr_err("Request for Xen HVM callback vector failed\n"); - xen_have_vector_callback = 0; - return; + xen_have_vector_callback = false; } - pr_info_once("Xen HVM callback vector for event delivery is enabled\n"); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); } } + +/* + * Setup per-vCPU vector-type callbacks. If this setup is unavailable, + * fallback to the global vector-type callback. + */ +static __init void xen_init_setup_upcall_vector(void) +{ + if (!xen_have_vector_callback) + return; + + if ((cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_UPCALL_VECTOR) && + !xen_set_upcall_vector(0)) + xen_percpu_upcall = true; + else if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_setup_callback_vector(); + else + xen_have_vector_callback = false; +} + +int xen_set_upcall_vector(unsigned int cpu) +{ + int rc; + xen_hvm_evtchn_upcall_vector_t op = { + .vector = HYPERVISOR_CALLBACK_VECTOR, + .vcpu = per_cpu(xen_vcpu_id, cpu), + }; + + rc = HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &op); + if (rc) + return rc; + + /* Trick toolstack to think we are enlightened. 
*/ + if (!cpu) + rc = xen_set_callback_via(1); + + return rc; +} + +static __init void xen_alloc_callback_vector(void) +{ + if (!xen_have_vector_callback) + return; + + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); +} #else -void xen_callback_vector(void) {} -#endif +void xen_setup_callback_vector(void) {} +static inline void xen_init_setup_upcall_vector(void) {} +int xen_set_upcall_vector(unsigned int cpu) {} +static inline void xen_alloc_callback_vector(void) {} +#endif /* CONFIG_XEN_PVHVM */ +#endif /* CONFIG_X86 */ -#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "xen." +bool xen_fifo_events = true; +module_param_named(fifo_events, xen_fifo_events, bool, 0); -static bool fifo_events = true; -module_param(fifo_events, bool, 0); +static int xen_evtchn_cpu_prepare(unsigned int cpu) +{ + int ret = 0; + + xen_cpu_init_eoi(cpu); + + if (evtchn_ops->percpu_init) + ret = evtchn_ops->percpu_init(cpu); + + return ret; +} + +static int xen_evtchn_cpu_dead(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_deinit) + ret = evtchn_ops->percpu_deinit(cpu); + + return ret; +} void __init xen_init_IRQ(void) { int ret = -EINVAL; - unsigned int evtchn; + evtchn_port_t evtchn; - if (fifo_events) + if (xen_fifo_events) ret = xen_evtchn_fifo_init(); - if (ret < 0) + if (ret < 0) { xen_evtchn_2l_init(); + xen_fifo_events = false; + } + + xen_cpu_init_eoi(smp_processor_id()); + + cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, + "xen/evtchn:prepare", + xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), sizeof(*evtchn_to_irq), GFP_KERNEL); @@ -1687,12 +2304,12 @@ void __init xen_init_IRQ(void) #ifdef CONFIG_X86 if (xen_pv_domain()) { - irq_ctx_init(smp_processor_id()); if (xen_initial_domain()) pci_xen_initial_domain(); } - if (xen_feature(XENFEAT_hvm_callback_vector)) - xen_callback_vector(); + xen_init_setup_upcall_vector(); + xen_alloc_callback_vector(); + if (xen_hvm_domain()) { native_init_IRQ(); diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 76b318e88382..655775db7caf 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -82,7 +82,7 @@ static unsigned event_array_pages __read_mostly; #endif -static inline event_word_t *event_word_from_port(unsigned port) +static inline event_word_t *event_word_from_port(evtchn_port_t port) { unsigned i = port / EVENT_WORDS_PER_PAGE; @@ -138,9 +138,8 @@ static void init_array_page(event_word_t *array_page) array_page[i] = 1 << EVTCHN_FIFO_MASKED; } -static int evtchn_fifo_setup(struct irq_info *info) +static int evtchn_fifo_setup(evtchn_port_t port) { - unsigned port = info->evtchn; unsigned new_array_pages; int ret; @@ -186,70 +185,72 @@ static int evtchn_fifo_setup(struct irq_info *info) return ret; } -static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +static void evtchn_fifo_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, + unsigned int old_cpu) { /* no-op */ } -static void evtchn_fifo_clear_pending(unsigned port) +static void evtchn_fifo_clear_pending(evtchn_port_t port) { event_word_t *word = event_word_from_port(port); sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); } -static void evtchn_fifo_set_pending(unsigned port) +static void evtchn_fifo_set_pending(evtchn_port_t port) { event_word_t *word = event_word_from_port(port); sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); } 
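
The conversion just below from open-coded sync_cmpxchg() loops to sync_try_cmpxchg() follows a common lock-free pattern: load the word once, then let each failed compare-exchange refresh the expected value instead of re-reading by hand. A standalone model of the reworked clear_masked_cond(), using C11 atomics in place of the kernel's sync_* helpers; the F_* bit positions are placeholders for the Xen FIFO ABI definitions, not the real values:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define F_PENDING (1u << 31)   /* placeholder bit positions */
    #define F_MASKED  (1u << 30)
    #define F_BUSY    (1u << 29)

    /* Clear MASKED only if the port is not PENDING, spinning while BUSY
     * is set; return false when the caller still needs the
     * EVTCHNOP_unmask hypercall to deliver the pending event. */
    static bool model_clear_masked_cond(_Atomic uint32_t *word)
    {
            uint32_t old = atomic_load(word), new;

            do {
                    if (!(old & F_MASKED))
                            return true;    /* already unmasked */
                    if (old & F_PENDING)
                            return false;   /* unmask via hypercall */
                    old &= ~F_BUSY;         /* expect BUSY clear, else retry */
                    new = old & ~F_MASKED;
            } while (!atomic_compare_exchange_weak(word, &old, new));

            return true;
    }

As in the kernel helper, clearing F_BUSY in the expected value makes the compare-exchange fail while the hypervisor holds the word busy, so the loop spins until BUSY drops.
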
-static bool evtchn_fifo_is_pending(unsigned port)
+static bool evtchn_fifo_is_pending(evtchn_port_t port)
 {
 	event_word_t *word = event_word_from_port(port);
 	return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
 }
 
-static bool evtchn_fifo_test_and_set_mask(unsigned port)
-{
-	event_word_t *word = event_word_from_port(port);
-	return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
-}
-
-static void evtchn_fifo_mask(unsigned port)
+static void evtchn_fifo_mask(evtchn_port_t port)
 {
 	event_word_t *word = event_word_from_port(port);
 	sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
 }
 
-static bool evtchn_fifo_is_masked(unsigned port)
+static bool evtchn_fifo_is_masked(evtchn_port_t port)
 {
 	event_word_t *word = event_word_from_port(port);
 	return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
 }
 
 /*
- * Clear MASKED, spinning if BUSY is set.
+ * Clear MASKED if not PENDING, spinning if BUSY is set.
+ * Return true if mask was cleared.
  */
-static void clear_masked(volatile event_word_t *word)
+static bool clear_masked_cond(volatile event_word_t *word)
 {
-	event_word_t new, old, w;
+	event_word_t new, old;
 
-	w = *word;
+	old = *word;
 
 	do {
-		old = w & ~(1 << EVTCHN_FIFO_BUSY);
+		if (!(old & (1 << EVTCHN_FIFO_MASKED)))
+			return true;
+
+		if (old & (1 << EVTCHN_FIFO_PENDING))
+			return false;
+
+		old = old & ~(1 << EVTCHN_FIFO_BUSY);
 		new = old & ~(1 << EVTCHN_FIFO_MASKED);
-		w = sync_cmpxchg(word, old, new);
-	} while (w != old);
+	} while (!sync_try_cmpxchg(word, &old, new));
+
+	return true;
 }
 
-static void evtchn_fifo_unmask(unsigned port)
+static void evtchn_fifo_unmask(evtchn_port_t port)
 {
 	event_word_t *word = event_word_from_port(port);
 
 	BUG_ON(!irqs_disabled());
 
-	clear_masked(word);
-	if (evtchn_fifo_is_pending(port)) {
+	if (!clear_masked_cond(word)) {
 		struct evtchn_unmask unmask = { .port = port };
 		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
 	}
@@ -257,36 +258,25 @@ static void evtchn_fifo_unmask(unsigned port)
 
 static uint32_t clear_linked(volatile event_word_t *word)
 {
-	event_word_t new, old, w;
+	event_word_t new, old;
 
-	w = *word;
+	old = *word;
 
 	do {
-		old = w;
-		new = (w & ~((1 << EVTCHN_FIFO_LINKED)
-			     | EVTCHN_FIFO_LINK_MASK));
-	} while ((w = sync_cmpxchg(word, old, new)) != old);
-
-	return w & EVTCHN_FIFO_LINK_MASK;
-}
-
-static void handle_irq_for_port(unsigned port)
-{
-	int irq;
+		new = (old & ~((1 << EVTCHN_FIFO_LINKED)
+			       | EVTCHN_FIFO_LINK_MASK));
+	} while (!sync_try_cmpxchg(word, &old, new));
 
-	irq = get_evtchn_to_irq(port);
-	if (irq != -1)
-		generic_handle_irq(irq);
+	return old & EVTCHN_FIFO_LINK_MASK;
 }
 
-static void consume_one_event(unsigned cpu,
+static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
 			      struct evtchn_fifo_control_block *control_block,
-			      unsigned priority, unsigned long *ready,
-			      bool drop)
+			      unsigned priority, unsigned long *ready)
 {
 	struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
 	uint32_t head;
-	unsigned port;
+	evtchn_port_t port;
 	event_word_t *word;
 
 	head = q->head[priority];
@@ -315,16 +305,17 @@ static void consume_one_event(unsigned cpu,
 		clear_bit(priority, ready);
 
 	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) {
-		if (unlikely(drop))
+		if (unlikely(!ctrl))
 			pr_warn("Dropping pending event for port %u\n", port);
 		else
-			handle_irq_for_port(port);
+			handle_irq_for_port(port, ctrl);
 	}
 
 	q->head[priority] = head;
 }
 
-static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
+static void __evtchn_fifo_handle_events(unsigned cpu,
+					struct evtchn_loop_ctrl *ctrl)
 {
 	struct evtchn_fifo_control_block *control_block;
 	unsigned long ready;
@@ -336,14 +327,15 @@ static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
 
 	while (ready) {
 		q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES);
-		consume_one_event(cpu, control_block, q, &ready, drop);
+		consume_one_event(cpu, ctrl, control_block, q, &ready);
 		ready |= xchg(&control_block->ready, 0);
 	}
 }
 
-static void evtchn_fifo_handle_events(unsigned cpu)
+static void evtchn_fifo_handle_events(unsigned cpu,
+				      struct evtchn_loop_ctrl *ctrl)
 {
-	__evtchn_fifo_handle_events(cpu, false);
+	__evtchn_fifo_handle_events(cpu, ctrl);
 }
 
 static void evtchn_fifo_resume(void)
@@ -380,21 +372,6 @@ static void evtchn_fifo_resume(void)
 	event_array_pages = 0;
 }
 
-static const struct evtchn_ops evtchn_ops_fifo = {
-	.max_channels = evtchn_fifo_max_channels,
-	.nr_channels = evtchn_fifo_nr_channels,
-	.setup = evtchn_fifo_setup,
-	.bind_to_cpu = evtchn_fifo_bind_to_cpu,
-	.clear_pending = evtchn_fifo_clear_pending,
-	.set_pending = evtchn_fifo_set_pending,
-	.is_pending = evtchn_fifo_is_pending,
-	.test_and_set_mask = evtchn_fifo_test_and_set_mask,
-	.mask = evtchn_fifo_mask,
-	.unmask = evtchn_fifo_unmask,
-	.handle_events = evtchn_fifo_handle_events,
-	.resume = evtchn_fifo_resume,
-};
-
 static int evtchn_fifo_alloc_control_block(unsigned cpu)
 {
 	void *control_block = NULL;
@@ -417,19 +394,35 @@ static int evtchn_fifo_alloc_control_block(unsigned cpu)
 	return ret;
 }
 
-static int xen_evtchn_cpu_prepare(unsigned int cpu)
+static int evtchn_fifo_percpu_init(unsigned int cpu)
 {
 	if (!per_cpu(cpu_control_block, cpu))
 		return evtchn_fifo_alloc_control_block(cpu);
 	return 0;
 }
 
-static int xen_evtchn_cpu_dead(unsigned int cpu)
+static int evtchn_fifo_percpu_deinit(unsigned int cpu)
 {
-	__evtchn_fifo_handle_events(cpu, true);
+	__evtchn_fifo_handle_events(cpu, NULL);
 	return 0;
 }
 
+static const struct evtchn_ops evtchn_ops_fifo = {
+	.max_channels = evtchn_fifo_max_channels,
+	.nr_channels = evtchn_fifo_nr_channels,
+	.setup = evtchn_fifo_setup,
+	.bind_to_cpu = evtchn_fifo_bind_to_cpu,
+	.clear_pending = evtchn_fifo_clear_pending,
+	.set_pending = evtchn_fifo_set_pending,
+	.is_pending = evtchn_fifo_is_pending,
+	.mask = evtchn_fifo_mask,
+	.unmask = evtchn_fifo_unmask,
+	.handle_events = evtchn_fifo_handle_events,
+	.resume = evtchn_fifo_resume,
+	.percpu_init = evtchn_fifo_percpu_init,
+	.percpu_deinit = evtchn_fifo_percpu_deinit,
+};
+
 int __init xen_evtchn_fifo_init(void)
 {
 	int cpu = smp_processor_id();
@@ -443,9 +436,5 @@ int __init xen_evtchn_fifo_init(void)
 
 	evtchn_ops = &evtchn_ops_fifo;
 
-	cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
-				  "xen/evtchn:prepare",
-				  xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
-
 	return ret;
 }
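The switch from sync_cmpxchg() to sync_try_cmpxchg() above keeps the same lock-free loop, but a failed exchange now refreshes the expected value automatically, so MASKED/PENDING are re-evaluated on every retry. A minimal userspace sketch of the same "clear MASKED unless PENDING" loop, written with GCC/C11 atomics; the bit positions are placeholders, the real layout comes from the Xen FIFO event channel ABI, not from this sketch:

	#include <stdbool.h>
	#include <stdint.h>

	#define F_MASKED  (1u << 29)	/* placeholder bit positions */
	#define F_BUSY    (1u << 30)
	#define F_PENDING (1u << 31)

	static bool clear_masked_cond_sketch(uint32_t *word)
	{
		uint32_t old = __atomic_load_n(word, __ATOMIC_RELAXED);

		do {
			if (!(old & F_MASKED))
				return true;	/* already unmasked */
			if (old & F_PENDING)
				return false;	/* caller falls back to the unmask hypercall */
			old &= ~F_BUSY;		/* only attempt the CAS once BUSY is clear */
		} while (!__atomic_compare_exchange_n(word, &old, old & ~F_MASKED,
						      false, __ATOMIC_ACQ_REL,
						      __ATOMIC_RELAXED));
		return true;
	}

As with sync_try_cmpxchg(), a failed __atomic_compare_exchange_n() writes the current value back into `old`, which is what makes the loop spin while BUSY is set without a separate reload.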
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
index 50c2050a1e32..19ae31695edc 100644
--- a/drivers/xen/events/events_internal.h
+++ b/drivers/xen/events/events_internal.h
@@ -1,86 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * Xen Event Channels (internal header)
  *
  * Copyright (C) 2013 Citrix Systems R&D Ltd.
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2 or later.  See the file COPYING for more details.
  */
 #ifndef __EVENTS_INTERNAL_H__
 #define __EVENTS_INTERNAL_H__
 
-/* Interrupt types. */
-enum xen_irq_type {
-	IRQT_UNBOUND = 0,
-	IRQT_PIRQ,
-	IRQT_VIRQ,
-	IRQT_IPI,
-	IRQT_EVTCHN
-};
-
-/*
- * Packed IRQ information:
- * type - enum xen_irq_type
- * event channel - irq->event channel mapping
- * cpu - cpu this event channel is bound to
- * index - type-specific information:
- *    PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
- *           guest, or GSI (real passthrough IRQ) of the device.
- *    VIRQ - virq number
- *    IPI - IPI vector
- *    EVTCHN -
- */
-struct irq_info {
-	struct list_head list;
-	int refcnt;
-	enum xen_irq_type type;	/* type */
-	unsigned irq;
-	unsigned int evtchn;	/* event channel */
-	unsigned short cpu;	/* cpu bound */
-
-	union {
-		unsigned short virq;
-		enum ipi_vector ipi;
-		struct {
-			unsigned short pirq;
-			unsigned short gsi;
-			unsigned char vector;
-			unsigned char flags;
-			uint16_t domid;
-		} pirq;
-	} u;
-};
-
-#define PIRQ_NEEDS_EOI	(1 << 0)
-#define PIRQ_SHAREABLE	(1 << 1)
-#define PIRQ_MSI_GROUP	(1 << 2)
+struct evtchn_loop_ctrl;
 
 struct evtchn_ops {
 	unsigned (*max_channels)(void);
 	unsigned (*nr_channels)(void);
 
-	int (*setup)(struct irq_info *info);
-	void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
+	int (*setup)(evtchn_port_t port);
+	void (*remove)(evtchn_port_t port, unsigned int cpu);
+	void (*bind_to_cpu)(evtchn_port_t evtchn, unsigned int cpu,
+			    unsigned int old_cpu);
 
-	void (*clear_pending)(unsigned port);
-	void (*set_pending)(unsigned port);
-	bool (*is_pending)(unsigned port);
-	bool (*test_and_set_mask)(unsigned port);
-	void (*mask)(unsigned port);
-	void (*unmask)(unsigned port);
+	void (*clear_pending)(evtchn_port_t port);
+	void (*set_pending)(evtchn_port_t port);
+	bool (*is_pending)(evtchn_port_t port);
+	void (*mask)(evtchn_port_t port);
+	void (*unmask)(evtchn_port_t port);
 
-	void (*handle_events)(unsigned cpu);
+	void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl);
 	void (*resume)(void);
+
+	int (*percpu_init)(unsigned int cpu);
+	int (*percpu_deinit)(unsigned int cpu);
 };
 
 extern const struct evtchn_ops *evtchn_ops;
 
-extern int **evtchn_to_irq;
-int get_evtchn_to_irq(unsigned int evtchn);
+void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
 
-struct irq_info *info_for_irq(unsigned irq);
-unsigned cpu_from_irq(unsigned irq);
-unsigned cpu_from_evtchn(unsigned int evtchn);
+unsigned int cpu_from_evtchn(evtchn_port_t evtchn);
 
 static inline unsigned xen_evtchn_max_channels(void)
 {
@@ -91,52 +46,56 @@ static inline unsigned xen_evtchn_max_channels(void)
  * Do any ABI specific setup for a bound event channel before it can
  * be unmasked and used.
  */
-static inline int xen_evtchn_port_setup(struct irq_info *info)
+static inline int xen_evtchn_port_setup(evtchn_port_t evtchn)
 {
 	if (evtchn_ops->setup)
-		return evtchn_ops->setup(info);
+		return evtchn_ops->setup(evtchn);
 	return 0;
 }
 
-static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
-					       unsigned cpu)
+static inline void xen_evtchn_port_remove(evtchn_port_t evtchn,
+					  unsigned int cpu)
 {
-	evtchn_ops->bind_to_cpu(info, cpu);
+	if (evtchn_ops->remove)
+		evtchn_ops->remove(evtchn, cpu);
 }
 
-static inline void clear_evtchn(unsigned port)
+static inline void xen_evtchn_port_bind_to_cpu(evtchn_port_t evtchn,
+					       unsigned int cpu,
+					       unsigned int old_cpu)
 {
-	evtchn_ops->clear_pending(port);
+	evtchn_ops->bind_to_cpu(evtchn, cpu, old_cpu);
 }
 
-static inline void set_evtchn(unsigned port)
+static inline void clear_evtchn(evtchn_port_t port)
 {
-	evtchn_ops->set_pending(port);
+	evtchn_ops->clear_pending(port);
 }
 
-static inline bool test_evtchn(unsigned port)
+static inline void set_evtchn(evtchn_port_t port)
 {
-	return evtchn_ops->is_pending(port);
+	evtchn_ops->set_pending(port);
 }
 
-static inline bool test_and_set_mask(unsigned port)
+static inline bool test_evtchn(evtchn_port_t port)
 {
-	return evtchn_ops->test_and_set_mask(port);
+	return evtchn_ops->is_pending(port);
 }
 
-static inline void mask_evtchn(unsigned port)
+static inline void mask_evtchn(evtchn_port_t port)
 {
 	return evtchn_ops->mask(port);
 }
 
-static inline void unmask_evtchn(unsigned port)
+static inline void unmask_evtchn(evtchn_port_t port)
 {
 	return evtchn_ops->unmask(port);
 }
 
-static inline void xen_evtchn_handle_events(unsigned cpu)
+static inline void xen_evtchn_handle_events(unsigned cpu,
+					    struct evtchn_loop_ctrl *ctrl)
 {
-	return evtchn_ops->handle_events(cpu);
+	return evtchn_ops->handle_events(cpu, ctrl);
 }
 
 static inline void xen_evtchn_resume(void)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 6d1a5e58968f..7e4a13e632dc 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -83,8 +83,9 @@ struct per_user_data {
 struct user_evtchn {
 	struct rb_node node;
 	struct per_user_data *user;
-	unsigned port;
+	evtchn_port_t port;
 	bool enabled;
+	bool unbinding;
 };
 
 static void evtchn_free_ring(evtchn_port_t *ring)
@@ -138,7 +139,8 @@ static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
 	kfree(evtchn);
 }
 
-static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port)
+static struct user_evtchn *find_evtchn(struct per_user_data *u,
+				       evtchn_port_t port)
 {
 	struct rb_node *node = u->evtchns.rb_node;
 
@@ -161,20 +163,28 @@ static irqreturn_t evtchn_interrupt(int irq, void *data)
 {
 	struct user_evtchn *evtchn = data;
 	struct per_user_data *u = evtchn->user;
+	unsigned int prod, cons;
+
+	/* Handler might be called when tearing down the IRQ. */
+	if (evtchn->unbinding)
+		return IRQ_HANDLED;
 
 	WARN(!evtchn->enabled,
-	     "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+	     "Interrupt for port %u, but apparently not enabled; per-user %p\n",
 	     evtchn->port, u);
 
-	disable_irq_nosync(irq);
 	evtchn->enabled = false;
 
 	spin_lock(&u->ring_prod_lock);
 
-	if ((u->ring_prod - u->ring_cons) < u->ring_size) {
-		*evtchn_ring_entry(u, u->ring_prod) = evtchn->port;
-		wmb(); /* Ensure ring contents visible */
-		if (u->ring_cons == u->ring_prod++) {
+	prod = READ_ONCE(u->ring_prod);
+	cons = READ_ONCE(u->ring_cons);
+
+	if ((prod - cons) < u->ring_size) {
+		*evtchn_ring_entry(u, prod) = evtchn->port;
+		smp_wmb(); /* Ensure ring contents visible */
+		WRITE_ONCE(u->ring_prod, prod + 1);
+		if (cons == prod) {
 			wake_up_interruptible(&u->evtchn_wait);
 			kill_fasync(&u->evtchn_async_queue,
 				    SIGIO, POLL_IN);
@@ -210,8 +220,8 @@ static ssize_t evtchn_read(struct file *file, char __user *buf,
 		if (u->ring_overflow)
 			goto unlock_out;
 
-		c = u->ring_cons;
-		p = u->ring_prod;
+		c = READ_ONCE(u->ring_cons);
+		p = READ_ONCE(u->ring_prod);
 		if (c != p)
 			break;
 
@@ -221,7 +231,7 @@ static ssize_t evtchn_read(struct file *file, char __user *buf,
 			return -EAGAIN;
 
 		rc = wait_event_interruptible(u->evtchn_wait,
-					      u->ring_cons != u->ring_prod);
+			READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod));
 		if (rc)
 			return rc;
 	}
@@ -245,13 +255,13 @@ static ssize_t evtchn_read(struct file *file, char __user *buf,
 	}
 
 	rc = -EFAULT;
-	rmb(); /* Ensure that we see the port before we copy it. */
+	smp_rmb(); /* Ensure that we see the port before we copy it. */
 	if (copy_to_user(buf, evtchn_ring_entry(u, c), bytes1) ||
 	    ((bytes2 != 0) &&
 	     copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
 		goto unlock_out;
 
-	u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+	WRITE_ONCE(u->ring_cons, c + (bytes1 + bytes2) / sizeof(evtchn_port_t));
 	rc = bytes1 + bytes2;
 
  unlock_out:
@@ -286,13 +296,13 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 	mutex_lock(&u->bind_mutex);
 
 	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
-		unsigned port = kbuf[i];
+		evtchn_port_t port = kbuf[i];
 		struct user_evtchn *evtchn;
 
 		evtchn = find_evtchn(u, port);
 		if (evtchn && !evtchn->enabled) {
 			evtchn->enabled = true;
-			enable_irq(irq_from_evtchn(port));
+			xen_irq_lateeoi(irq_from_evtchn(port), 0);
 		}
 	}
 
@@ -361,10 +371,10 @@ static int evtchn_resize_ring(struct per_user_data *u)
 	return 0;
 }
 
-static int evtchn_bind_to_user(struct per_user_data *u, int port)
+static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port,
+			       bool is_static)
 {
 	struct user_evtchn *evtchn;
-	struct evtchn_close close;
 	int rc = 0;
 
 	/*
@@ -392,19 +402,19 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
 	if (rc < 0)
 		goto err;
 
-	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0,
-				       u->name, evtchn);
+	rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, IRQF_SHARED,
+					       u->name, evtchn);
 	if (rc < 0)
 		goto err;
 
-	rc = evtchn_make_refcounted(port);
+	rc = evtchn_make_refcounted(port, is_static);
 	return rc;
 
 err:
 	/* bind failed, should close the port now */
-	close.port = port;
-	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
-		BUG();
+	if (!is_static)
+		xen_evtchn_close(port);
+
 	del_evtchn(u, evtchn);
 	return rc;
 }
@@ -416,41 +426,12 @@ static void evtchn_unbind_from_user(struct per_user_data *u,
 
 	BUG_ON(irq < 0);
 
+	evtchn->unbinding = true;
 	unbind_from_irqhandler(irq, evtchn);
 
 	del_evtchn(u, evtchn);
 }
 
-static DEFINE_PER_CPU(int, bind_last_selected_cpu);
-
-static void evtchn_bind_interdom_next_vcpu(int evtchn)
-{
-	unsigned int selected_cpu, irq;
-	struct irq_desc *desc;
-	unsigned long flags;
-
-	irq = irq_from_evtchn(evtchn);
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	selected_cpu = this_cpu_read(bind_last_selected_cpu);
-	selected_cpu = cpumask_next_and(selected_cpu,
-			desc->irq_common_data.affinity, cpu_online_mask);
-
-	if (unlikely(selected_cpu >= nr_cpu_ids))
-		selected_cpu = cpumask_first_and(desc->irq_common_data.affinity,
-				cpu_online_mask);
-
-	this_cpu_write(bind_last_selected_cpu, selected_cpu);
-
-	/* unmask expects irqs to be disabled */
-	xen_rebind_evtchn_to_cpu(evtchn, selected_cpu);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
 static long evtchn_ioctl(struct file *file,
 			 unsigned int cmd, unsigned long arg)
 {
@@ -481,7 +462,7 @@ static long evtchn_ioctl(struct file *file,
 		if (rc != 0)
 			break;
 
-		rc = evtchn_bind_to_user(u, bind_virq.port);
+		rc = evtchn_bind_to_user(u, bind_virq.port, false);
 		if (rc == 0)
 			rc = bind_virq.port;
 		break;
@@ -507,11 +488,9 @@ static long evtchn_ioctl(struct file *file,
 		if (rc != 0)
 			break;
 
-		rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
-		if (rc == 0) {
+		rc = evtchn_bind_to_user(u, bind_interdomain.local_port, false);
+		if (rc == 0)
 			rc = bind_interdomain.local_port;
-			evtchn_bind_interdom_next_vcpu(rc);
-		}
 		break;
 	}
 
@@ -534,7 +513,7 @@ static long evtchn_ioctl(struct file *file,
 		if (rc != 0)
 			break;
 
-		rc = evtchn_bind_to_user(u, alloc_unbound.port);
+		rc = evtchn_bind_to_user(u, alloc_unbound.port, false);
 		if (rc == 0)
 			rc = alloc_unbound.port;
 		break;
@@ -563,6 +542,23 @@ static long evtchn_ioctl(struct file *file,
 		break;
 	}
 
+	case IOCTL_EVTCHN_BIND_STATIC: {
+		struct ioctl_evtchn_bind bind;
+		struct user_evtchn *evtchn;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		rc = -EISCONN;
+		evtchn = find_evtchn(u, bind.port);
+		if (evtchn)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind.port, true);
+		break;
+	}
+
 	case IOCTL_EVTCHN_NOTIFY: {
 		struct ioctl_evtchn_notify notify;
 		struct user_evtchn *evtchn;
@@ -584,7 +580,9 @@ static long evtchn_ioctl(struct file *file,
 		/* Initialise the ring to empty. Clear errors. */
 		mutex_lock(&u->ring_cons_mutex);
 		spin_lock_irq(&u->ring_prod_lock);
-		u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+		WRITE_ONCE(u->ring_cons, 0);
+		WRITE_ONCE(u->ring_prod, 0);
+		u->ring_overflow = 0;
 		spin_unlock_irq(&u->ring_prod_lock);
 		mutex_unlock(&u->ring_cons_mutex);
 		rc = 0;
@@ -627,7 +625,7 @@ static __poll_t evtchn_poll(struct file *file, poll_table *wait)
 	struct per_user_data *u = file->private_data;
 
 	poll_wait(file, &u->evtchn_wait, wait);
-	if (u->ring_cons != u->ring_prod)
+	if (READ_ONCE(u->ring_cons) != READ_ONCE(u->ring_prod))
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (u->ring_overflow)
 		mask = EPOLLERR;
@@ -664,7 +662,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = u;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int evtchn_release(struct inode *inode, struct file *filp)
@@ -696,7 +694,6 @@ static const struct file_operations evtchn_fops = {
 	.fasync = evtchn_fasync,
 	.open = evtchn_open,
 	.release = evtchn_release,
-	.llseek = no_llseek,
 };
 
 static struct miscdevice evtchn_miscdev = {
@@ -731,4 +728,5 @@ static void __exit evtchn_cleanup(void)
 module_init(evtchn_init);
 module_exit(evtchn_cleanup);
 
+MODULE_DESCRIPTION("Xen /dev/xen/evtchn device driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c
deleted file mode 100644
index b04fb64c5a91..000000000000
--- a/drivers/xen/fallback.c
+++ /dev/null
@@ -1,81 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/bug.h>
-#include <linux/export.h>
-#include <asm/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-int xen_event_channel_op_compat(int cmd, void *arg)
-{
-	struct evtchn_op op;
-	int rc;
-
-	op.cmd = cmd;
-	memcpy(&op.u, arg, sizeof(op.u));
-	rc = _hypercall1(int, event_channel_op_compat, &op);
-
-	switch (cmd) {
-	case EVTCHNOP_close:
-	case EVTCHNOP_send:
-	case EVTCHNOP_bind_vcpu:
-	case EVTCHNOP_unmask:
-		/* no output */
-		break;
-
-#define COPY_BACK(eop) \
-	case EVTCHNOP_##eop: \
-		memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \
-		break
-
-	COPY_BACK(bind_interdomain);
-	COPY_BACK(bind_virq);
-	COPY_BACK(bind_pirq);
-	COPY_BACK(status);
-	COPY_BACK(alloc_unbound);
-	COPY_BACK(bind_ipi);
-#undef COPY_BACK
-
-	default:
-		WARN_ON(rc != -ENOSYS);
-		break;
-	}
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(xen_event_channel_op_compat);
-
-int xen_physdev_op_compat(int cmd, void *arg)
-{
-	struct physdev_op op;
-	int rc;
-
-	op.cmd = cmd;
-	memcpy(&op.u, arg, sizeof(op.u));
-	rc = _hypercall1(int, physdev_op_compat, &op);
-
-	switch (cmd) {
-	case PHYSDEVOP_IRQ_UNMASK_NOTIFY:
-	case PHYSDEVOP_set_iopl:
-	case PHYSDEVOP_set_iobitmap:
-	case PHYSDEVOP_apic_write:
-		/* no output */
-		break;
-
-#define COPY_BACK(pop, fld) \
-	case PHYSDEVOP_##pop: \
-		memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \
-		break
-
-	COPY_BACK(irq_status_query, irq_status_query);
-	COPY_BACK(apic_read, apic_op);
-	COPY_BACK(ASSIGN_VECTOR, irq_op);
-#undef COPY_BACK
-
-	default:
-		WARN_ON(rc != -ENOSYS);
-		break;
-	}
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(xen_physdev_op_compat);
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
index d7d34fdfc993..87f1828d40d5 100644
--- a/drivers/xen/features.c
+++ b/drivers/xen/features.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /******************************************************************************
  * features.c
  *
@@ -8,13 +9,26 @@
 #include <linux/types.h>
 #include <linux/cache.h>
 #include <linux/export.h>
+#include <linux/printk.h>
 
 #include <asm/xen/hypercall.h>
 
+#include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
 #include <xen/features.h>
 
+/*
+ * Linux kernel expects at least Xen 4.0.
+ *
+ * Assume some features to be available for that reason (depending on guest
+ * mode, of course).
+ */
+#define chk_required_feature(f) {					\
+		if (!xen_feature(f))					\
+			panic("Xen: feature %s not available!\n", #f);	\
+	}
+
 u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
 EXPORT_SYMBOL_GPL(xen_features);
 
@@ -28,6 +42,11 @@ void xen_setup_features(void)
 		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
 			break;
 		for (j = 0; j < 32; j++)
-			xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+			xen_features[i * 32 + j] = !!(fi.submap & 1U << j);
+	}
+
+	if (xen_pv_domain()) {
+		chk_required_feature(XENFEAT_mmu_pt_update_preserve_ad);
+		chk_required_feature(XENFEAT_gnttab_map_avail_bits);
 	}
 }
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index 3fa40c723e8e..f93f73ecefee 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -169,14 +169,6 @@ undo:
 		__del_gref(gref);
 	}
 
-	/* It's possible for the target domain to map the just-allocated grant
-	 * references by blindly guessing their IDs; if this is done, then
-	 * __del_gref will leave them in the queue_gref list. They need to be
-	 * added to the global list so that we can free them when they are no
-	 * longer referenced.
-	 */
-	if (unlikely(!list_empty(&queue_gref)))
-		list_splice_tail(&queue_gref, &gref_list);
 	mutex_unlock(&gref_mutex);
 	return rc;
 }
@@ -184,9 +176,9 @@ undo:
 static void __del_gref(struct gntalloc_gref *gref)
 {
 	if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
-		uint8_t *tmp = kmap(gref->page);
+		uint8_t *tmp = kmap_local_page(gref->page);
 		tmp[gref->notify.pgoff] = 0;
-		kunmap(gref->page);
+		kunmap_local(tmp);
 	}
 	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
 		notify_remote_via_evtchn(gref->notify.event);
@@ -196,21 +188,15 @@ static void __del_gref(struct gntalloc_gref *gref)
 	gref->notify.flags = 0;
 
 	if (gref->gref_id) {
-		if (gnttab_query_foreign_access(gref->gref_id))
-			return;
-
-		if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
-			return;
-
-		gnttab_free_grant_reference(gref->gref_id);
+		if (gref->page)
+			gnttab_end_foreign_access(gref->gref_id, gref->page);
+		else
+			gnttab_free_grant_reference(gref->gref_id);
 	}
 
 	gref_size--;
 	list_del(&gref->next_gref);
 
-	if (gref->page)
-		__free_page(gref->page);
-
 	kfree(gref);
 }
 
@@ -331,7 +317,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
 		rc = -EFAULT;
 		goto out_free;
 	}
-	if (copy_to_user(arg->gref_ids, gref_ids,
+	if (copy_to_user(arg->gref_ids_flex, gref_ids,
 			 sizeof(gref_ids[0]) * op.count)) {
 		rc = -EFAULT;
 		goto out_free;
@@ -539,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
 
 	vma->vm_private_data = vm_priv;
 
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
 
 	vma->vm_ops = &gntalloc_vmops;
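The evtchn.c changes above replace bare loads and stores of ring_prod and ring_cons with READ_ONCE()/WRITE_ONCE(), and pair the producer's smp_wmb() with the consumer's smp_rmb(). A hedged userspace sketch of the same single-producer/single-consumer discipline, using C11 acquire/release in place of the kernel's barrier macros (the ring size and names are illustrative only):

	#include <stdatomic.h>
	#include <stdint.h>

	#define RING_SIZE 128	/* power of two; placeholder */

	struct ring {
		uint32_t slots[RING_SIZE];
		_Atomic unsigned int prod;
		_Atomic unsigned int cons;
	};

	/* single producer, analogous to evtchn_interrupt() */
	static int ring_push(struct ring *r, uint32_t port)
	{
		unsigned int prod = atomic_load_explicit(&r->prod, memory_order_relaxed);
		unsigned int cons = atomic_load_explicit(&r->cons, memory_order_acquire);

		if (prod - cons >= RING_SIZE)
			return -1;	/* full: the driver sets ring_overflow here */
		r->slots[prod & (RING_SIZE - 1)] = port;
		/* release pairs with the consumer's acquire: data before index */
		atomic_store_explicit(&r->prod, prod + 1, memory_order_release);
		return 0;
	}

	/* single consumer, analogous to evtchn_read() */
	static int ring_pop(struct ring *r, uint32_t *port)
	{
		unsigned int cons = atomic_load_explicit(&r->cons, memory_order_relaxed);
		unsigned int prod = atomic_load_explicit(&r->prod, memory_order_acquire);

		if (cons == prod)
			return -1;	/* empty */
		*port = r->slots[cons & (RING_SIZE - 1)];
		atomic_store_explicit(&r->cons, cons + 1, memory_order_release);
		return 0;
	}

The free-running unsigned indices mean `prod - cons` stays correct across wraparound, which is why the driver never masks the counters themselves, only the slot index.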
diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
index 2f8b949c3eeb..ac8ce3179ba2 100644
--- a/drivers/xen/gntdev-common.h
+++ b/drivers/xen/gntdev-common.h
@@ -15,21 +15,20 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/types.h>
+#include <xen/interface/event_channel.h>
+#include <xen/grant_table.h>
 
 struct gntdev_dmabuf_priv;
 
 struct gntdev_priv {
 	/* Maps with visible offsets in the file descriptor. */
 	struct list_head maps;
-	/*
-	 * Maps that are not visible; will be freed on munmap.
-	 * Only populated if populate_freeable_maps == 1
-	 */
-	struct list_head freeable_maps;
 	/* lock protects maps and freeable_maps. */
 	struct mutex lock;
-	struct mm_struct *mm;
-	struct mmu_notifier mn;
+
+	/* Free instances of struct gntdev_copy_batch. */
+	struct gntdev_copy_batch *batch;
+	struct mutex batch_lock;
 
 #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
 	/* Device for which DMA memory is allocated. */
@@ -45,12 +44,14 @@ struct gntdev_unmap_notify {
 	int flags;
 	/* Address relative to the start of the gntdev_grant_map. */
 	int addr;
-	int event;
+	evtchn_port_t event;
 };
 
 struct gntdev_grant_map {
+	atomic_t in_use;
+	struct mmu_interval_notifier notifier;
+	bool notifier_init;
 	struct list_head next;
-	struct vm_area_struct *vma;
 	int index;
 	int count;
 	int flags;
@@ -61,6 +62,7 @@ struct gntdev_grant_map {
 	struct gnttab_unmap_grant_ref *unmap_ops;
 	struct gnttab_map_grant_ref *kmap_ops;
 	struct gnttab_unmap_grant_ref *kunmap_ops;
+	bool *being_removed;
 	struct page **pages;
 	unsigned long pages_vm_start;
 
@@ -78,6 +80,11 @@ struct gntdev_grant_map {
 	/* Needed to avoid allocation in gnttab_dma_free_pages(). */
 	xen_pfn_t *frames;
 #endif
+
+	/* Number of live grants */
+	atomic_t live_grants;
+	/* Needed to avoid allocation in __unmap_grant_pages */
+	struct gntab_unmap_queue_data unmap_data;
 };
 
 struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
@@ -87,7 +94,7 @@ void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add);
 
 void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map);
 
-bool gntdev_account_mapped_pages(int count);
+bool gntdev_test_page_count(unsigned int count);
 
 int gntdev_map_grant_pages(struct gntdev_grant_map *map);
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
index cba6b586bfbd..550980dd3b0b 100644
--- a/drivers/xen/gntdev-dmabuf.c
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -11,9 +11,11 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-direct.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
 
 #include <xen/xen.h>
 #include <xen/grant_table.h>
@@ -21,14 +23,7 @@
 #include "gntdev-common.h"
 #include "gntdev-dmabuf.h"
 
-#ifndef GRANT_INVALID_REF
-/*
- * Note on usage of grant reference 0 as invalid grant reference:
- * grant reference 0 is valid, but never exposed to a driver,
- * because of the fact it is already in use/reserved by the PV console.
- */
-#define GRANT_INVALID_REF	0
-#endif
+MODULE_IMPORT_NS("DMA_BUF");
 
 struct gntdev_dmabuf {
 	struct gntdev_dmabuf_priv *priv;
@@ -56,7 +51,7 @@ struct gntdev_dmabuf {
 	/* Number of pages this buffer has. */
 	int nr_pages;
 
-	/* Pages of this buffer. */
+	/* Pages of this buffer (only for dma-buf export). */
 	struct page **pages;
 };
 
@@ -80,6 +75,12 @@ struct gntdev_dmabuf_priv {
 	struct list_head imp_list;
 	/* This is the lock which protects dma_buf_xxx lists. */
 	struct mutex lock;
+	/*
+	 * We reference this file while exporting dma-bufs, so
+	 * the grant device context is not destroyed while there are
+	 * external users alive.
+	 */
+	struct file *filp;
 };
 
 /* DMA buffer export support. */
@@ -241,10 +242,9 @@ static void dmabuf_exp_ops_detach(struct dma_buf *dma_buf,
 
 		if (sgt) {
 			if (gntdev_dmabuf_attach->dir != DMA_NONE)
-				dma_unmap_sg_attrs(attach->dev, sgt->sgl,
-						   sgt->nents,
-						   gntdev_dmabuf_attach->dir,
-						   DMA_ATTR_SKIP_CPU_SYNC);
+				dma_unmap_sgtable(attach->dev, sgt,
+						  gntdev_dmabuf_attach->dir,
+						  DMA_ATTR_SKIP_CPU_SYNC);
 			sg_free_table(sgt);
 		}
 
@@ -282,8 +282,8 @@ dmabuf_exp_ops_map_dma_buf(struct dma_buf_attachment *attach,
 	sgt = dmabuf_pages_to_sgt(gntdev_dmabuf->pages,
 				  gntdev_dmabuf->nr_pages);
 	if (!IS_ERR(sgt)) {
-		if (!dma_map_sg_attrs(attach->dev, sgt->sgl, sgt->nents, dir,
-				      DMA_ATTR_SKIP_CPU_SYNC)) {
+		if (dma_map_sgtable(attach->dev, sgt, dir,
+				    DMA_ATTR_SKIP_CPU_SYNC)) {
 			sg_free_table(sgt);
 			kfree(sgt);
 			sgt = ERR_PTR(-ENOMEM);
@@ -311,6 +311,7 @@ static void dmabuf_exp_release(struct kref *kref)
 
 	dmabuf_exp_wait_obj_signal(gntdev_dmabuf->priv, gntdev_dmabuf);
 	list_del(&gntdev_dmabuf->next);
+	fput(gntdev_dmabuf->priv->filp);
 	kfree(gntdev_dmabuf);
 }
 
@@ -335,35 +336,12 @@ static void dmabuf_exp_ops_release(struct dma_buf *dma_buf)
 	mutex_unlock(&priv->lock);
 }
 
-static void *dmabuf_exp_ops_kmap(struct dma_buf *dma_buf,
-				 unsigned long page_num)
-{
-	/* Not implemented. */
-	return NULL;
-}
-
-static void dmabuf_exp_ops_kunmap(struct dma_buf *dma_buf,
-				  unsigned long page_num, void *addr)
-{
-	/* Not implemented. */
-}
-
-static int dmabuf_exp_ops_mmap(struct dma_buf *dma_buf,
-			       struct vm_area_struct *vma)
-{
-	/* Not implemented. */
-	return 0;
-}
-
 static const struct dma_buf_ops dmabuf_exp_ops = {
 	.attach = dmabuf_exp_ops_attach,
 	.detach = dmabuf_exp_ops_detach,
 	.map_dma_buf = dmabuf_exp_ops_map_dma_buf,
 	.unmap_dma_buf = dmabuf_exp_ops_unmap_dma_buf,
 	.release = dmabuf_exp_ops_release,
-	.map = dmabuf_exp_ops_kmap,
-	.unmap = dmabuf_exp_ops_kunmap,
-	.mmap = dmabuf_exp_ops_mmap,
 };
 
 struct gntdev_dmabuf_export_args {
@@ -379,8 +357,11 @@ struct gntdev_dmabuf_export_args {
 static int dmabuf_exp_from_pages(struct gntdev_dmabuf_export_args *args)
 {
 	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
-	struct gntdev_dmabuf *gntdev_dmabuf;
-	int ret;
+	struct gntdev_dmabuf *gntdev_dmabuf __free(kfree) = NULL;
+	CLASS(get_unused_fd, ret)(O_CLOEXEC);
+
+	if (ret < 0)
+		return ret;
 
 	gntdev_dmabuf = kzalloc(sizeof(*gntdev_dmabuf), GFP_KERNEL);
 	if (!gntdev_dmabuf)
@@ -405,31 +386,21 @@ static int dmabuf_exp_from_pages(struct gntdev_dmabuf_export_args *args)
 	exp_info.priv = gntdev_dmabuf;
 
 	gntdev_dmabuf->dmabuf = dma_buf_export(&exp_info);
-	if (IS_ERR(gntdev_dmabuf->dmabuf)) {
-		ret = PTR_ERR(gntdev_dmabuf->dmabuf);
-		gntdev_dmabuf->dmabuf = NULL;
-		goto fail;
-	}
-
-	ret = dma_buf_fd(gntdev_dmabuf->dmabuf, O_CLOEXEC);
-	if (ret < 0)
-		goto fail;
+	if (IS_ERR(gntdev_dmabuf->dmabuf))
+		return PTR_ERR(gntdev_dmabuf->dmabuf);
 
 	gntdev_dmabuf->fd = ret;
 	args->fd = ret;
 
 	pr_debug("Exporting DMA buffer with fd %d\n", ret);
 
+	get_file(gntdev_dmabuf->priv->filp);
 	mutex_lock(&args->dmabuf_priv->lock);
 	list_add(&gntdev_dmabuf->next, &args->dmabuf_priv->exp_list);
 	mutex_unlock(&args->dmabuf_priv->lock);
-	return 0;
 
-fail:
-	if (gntdev_dmabuf->dmabuf)
-		dma_buf_put(gntdev_dmabuf->dmabuf);
-	kfree(gntdev_dmabuf);
-	return ret;
+	fd_install(take_fd(ret), no_free_ptr(gntdev_dmabuf)->dmabuf->file);
+
+	return 0;
 }
 
 static struct gntdev_grant_map *
@@ -438,7 +409,7 @@ dmabuf_exp_alloc_backing_storage(struct gntdev_priv *priv, int dmabuf_flags,
 {
 	struct gntdev_grant_map *map;
 
-	if (unlikely(count <= 0))
+	if (unlikely(gntdev_test_page_count(count)))
 		return ERR_PTR(-EINVAL);
 
 	if ((dmabuf_flags & GNTDEV_DMA_FLAG_WC) &&
@@ -451,11 +422,6 @@ dmabuf_exp_alloc_backing_storage(struct gntdev_priv *priv, int dmabuf_flags,
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	if (unlikely(gntdev_account_mapped_pages(count))) {
-		pr_debug("can't map %d pages: over limit\n", count);
-		gntdev_put_map(NULL, map);
-		return ERR_PTR(-ENOMEM);
-	}
 	return map;
 }
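The export path above moves from dma_map_sg_attrs(), which returns the number of mapped entries (0 on failure), to dma_map_sgtable(), which returns 0 or a negative errno and records the mapped-entry count in the sg_table itself; that is why the error check flips polarity in dmabuf_exp_ops_map_dma_buf(). A minimal sketch of the newer convention (the helper name is hypothetical, the dma-mapping calls are the real kernel API):

	#include <linux/dma-mapping.h>

	static int map_attachment_sketch(struct device *dev, struct sg_table *sgt,
					 enum dma_data_direction dir)
	{
		int ret;

		/* On success, the mapped entry count is stored in the sg_table. */
		ret = dma_map_sgtable(dev, sgt, dir, DMA_ATTR_SKIP_CPU_SYNC);
		if (ret)
			return ret;	/* negative errno; nothing left mapped */

		/* ... walk the mapping with for_each_sgtable_dma_page() ... */

		dma_unmap_sgtable(dev, sgt, dir, DMA_ATTR_SKIP_CPU_SYNC);
		return 0;
	}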
@@ -511,7 +477,7 @@ out:
 
 /* DMA buffer import support. */
 
 static int
-dmabuf_imp_grant_foreign_access(struct page **pages, u32 *refs,
+dmabuf_imp_grant_foreign_access(unsigned long *gfns, u32 *refs,
 				int count, int domid)
 {
 	grant_ref_t priv_gref_head;
@@ -534,7 +500,7 @@ dmabuf_imp_grant_foreign_access(struct page **pages, u32 *refs,
 		}
 
 		gnttab_grant_foreign_access_ref(cur_ref, domid,
-						xen_page_to_gfn(pages[i]), 0);
+						gfns[i], 0);
 		refs[i] = cur_ref;
 	}
 
@@ -550,13 +516,12 @@ static void dmabuf_imp_end_foreign_access(u32 *refs, int count)
 	int i;
 
 	for (i = 0; i < count; i++)
-		if (refs[i] != GRANT_INVALID_REF)
-			gnttab_end_foreign_access(refs[i], 0, 0UL);
+		if (refs[i] != INVALID_GRANT_REF)
+			gnttab_end_foreign_access(refs[i], NULL);
 }
 
 static void dmabuf_imp_free_storage(struct gntdev_dmabuf *gntdev_dmabuf)
 {
-	kfree(gntdev_dmabuf->pages);
 	kfree(gntdev_dmabuf->u.imp.refs);
 	kfree(gntdev_dmabuf);
 }
@@ -576,16 +541,10 @@ static struct gntdev_dmabuf *dmabuf_imp_alloc_storage(int count)
 	if (!gntdev_dmabuf->u.imp.refs)
 		goto fail;
 
-	gntdev_dmabuf->pages = kcalloc(count,
-				       sizeof(gntdev_dmabuf->pages[0]),
-				       GFP_KERNEL);
-	if (!gntdev_dmabuf->pages)
-		goto fail;
-
 	gntdev_dmabuf->nr_pages = count;
 
 	for (i = 0; i < count; i++)
-		gntdev_dmabuf->u.imp.refs[i] = GRANT_INVALID_REF;
+		gntdev_dmabuf->u.imp.refs[i] = INVALID_GRANT_REF;
 
 	return gntdev_dmabuf;
 
@@ -603,7 +562,8 @@ dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev,
 	struct dma_buf *dma_buf;
 	struct dma_buf_attachment *attach;
 	struct sg_table *sgt;
-	struct sg_page_iter sg_iter;
+	struct sg_dma_page_iter sg_iter;
+	unsigned long *gfns;
 	int i;
 
 	dma_buf = dma_buf_get(fd);
@@ -627,12 +587,20 @@ dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev,
 
 	gntdev_dmabuf->u.imp.attach = attach;
 
-	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
+	sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL);
 	if (IS_ERR(sgt)) {
 		ret = ERR_CAST(sgt);
 		goto fail_detach;
 	}
 
+	/* Check that we have zero offset. */
+	if (sgt->sgl->offset) {
+		ret = ERR_PTR(-EINVAL);
+		pr_debug("DMA buffer has %d bytes offset, user-space expects 0\n",
+			 sgt->sgl->offset);
+		goto fail_unmap;
+	}
+
 	/* Check number of pages that imported buffer has. */
 	if (attach->dmabuf->size != gntdev_dmabuf->nr_pages << PAGE_SHIFT) {
 		ret = ERR_PTR(-EINVAL);
@@ -643,26 +611,31 @@ dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev,
 
 	gntdev_dmabuf->u.imp.sgt = sgt;
 
-	/* Now convert sgt to array of pages and check for page validity. */
+	gfns = kcalloc(count, sizeof(*gfns), GFP_KERNEL);
+	if (!gfns) {
+		ret = ERR_PTR(-ENOMEM);
+		goto fail_unmap;
+	}
+
+	/*
+	 * Now convert sgt to array of gfns without accessing underlying pages.
+	 * It is not allowed to access the underlying struct page of an sg table
+	 * exported by DMA-buf, but since we deal with special Xen dma device here
+	 * (not a normal physical one) look at the dma addresses in the sg table
+	 * and then calculate gfns directly from them.
+	 */
 	i = 0;
-	for_each_sg_page(sgt->sgl, &sg_iter, sgt->nents, 0) {
-		struct page *page = sg_page_iter_page(&sg_iter);
-		/*
-		 * Check if page is valid: this can happen if we are given
-		 * a page from VRAM or other resources which are not backed
-		 * by a struct page.
-		 */
-		if (!pfn_valid(page_to_pfn(page))) {
-			ret = ERR_PTR(-EINVAL);
-			goto fail_unmap;
-		}
+	for_each_sgtable_dma_page(sgt, &sg_iter, 0) {
+		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
+		unsigned long pfn = bfn_to_pfn(XEN_PFN_DOWN(dma_to_phys(dev, addr)));
 
-		gntdev_dmabuf->pages[i++] = page;
+		gfns[i++] = pfn_to_gfn(pfn);
 	}
 
-	ret = ERR_PTR(dmabuf_imp_grant_foreign_access(gntdev_dmabuf->pages,
+	ret = ERR_PTR(dmabuf_imp_grant_foreign_access(gfns,
 						      gntdev_dmabuf->u.imp.refs,
 						      count, domid));
+	kfree(gfns);
 	if (IS_ERR(ret))
 		goto fail_end_access;
 
@@ -677,7 +650,7 @@ dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev,
 fail_end_access:
 	dmabuf_imp_end_foreign_access(gntdev_dmabuf->u.imp.refs, count);
 fail_unmap:
-	dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
+	dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL);
 fail_detach:
 	dma_buf_detach(dma_buf, attach);
 fail_free_obj:
@@ -727,8 +700,8 @@ static int dmabuf_imp_release(struct gntdev_dmabuf_priv *priv, u32 fd)
 
 	attach = gntdev_dmabuf->u.imp.attach;
 
 	if (gntdev_dmabuf->u.imp.sgt)
-		dma_buf_unmap_attachment(attach, gntdev_dmabuf->u.imp.sgt,
-					 DMA_BIDIRECTIONAL);
+		dma_buf_unmap_attachment_unlocked(attach, gntdev_dmabuf->u.imp.sgt,
+						  DMA_BIDIRECTIONAL);
 	dma_buf = attach->dmabuf;
 	dma_buf_detach(attach->dmabuf, attach);
 	dma_buf_put(dma_buf);
@@ -737,25 +710,32 @@ static int dmabuf_imp_release(struct gntdev_dmabuf_priv *priv, u32 fd)
 	return 0;
 }
 
+static void dmabuf_imp_release_all(struct gntdev_dmabuf_priv *priv)
+{
+	struct gntdev_dmabuf *q, *gntdev_dmabuf;
+
+	list_for_each_entry_safe(gntdev_dmabuf, q, &priv->imp_list, next)
+		dmabuf_imp_release(priv, gntdev_dmabuf->fd);
+}
+
 /* DMA buffer IOCTL support. */
 
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
 				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u)
 {
 	struct ioctl_gntdev_dmabuf_exp_from_refs op;
 	u32 *refs;
 	long ret;
 
-	if (use_ptemod) {
-		pr_debug("Cannot provide dma-buf: use_ptemode %d\n",
-			 use_ptemod);
+	if (xen_pv_domain()) {
+		pr_debug("Cannot provide dma-buf in a PV domain\n");
 		return -EINVAL;
 	}
 
 	if (copy_from_user(&op, u, sizeof(op)) != 0)
 		return -EFAULT;
 
-	if (unlikely(op.count <= 0))
+	if (unlikely(gntdev_test_page_count(op.count)))
 		return -EINVAL;
 
 	refs = kcalloc(op.count, sizeof(*refs), GFP_KERNEL);
@@ -802,7 +782,7 @@ long gntdev_ioctl_dmabuf_imp_to_refs(struct gntdev_priv *priv,
 	if (copy_from_user(&op, u, sizeof(op)) != 0)
 		return -EFAULT;
 
-	if (unlikely(op.count <= 0))
+	if (unlikely(gntdev_test_page_count(op.count)))
 		return -EINVAL;
 
 	gntdev_dmabuf = dmabuf_imp_to_refs(priv->dmabuf_priv,
@@ -834,7 +814,7 @@ long gntdev_ioctl_dmabuf_imp_release(struct gntdev_priv *priv,
 	return dmabuf_imp_release(priv->dmabuf_priv, op.fd);
 }
 
-struct gntdev_dmabuf_priv *gntdev_dmabuf_init(void)
+struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp)
 {
 	struct gntdev_dmabuf_priv *priv;
 
@@ -847,10 +827,13 @@ struct gntdev_dmabuf_priv *gntdev_dmabuf_init(void)
 	INIT_LIST_HEAD(&priv->exp_wait_list);
 	INIT_LIST_HEAD(&priv->imp_list);
 
+	priv->filp = filp;
+
 	return priv;
 }
 
 void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv)
 {
+	dmabuf_imp_release_all(priv);
 	kfree(priv);
 }
diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h
index 7220a53d0fc5..9adf96ac74d3 100644
--- a/drivers/xen/gntdev-dmabuf.h
+++ b/drivers/xen/gntdev-dmabuf.h
@@ -14,11 +14,11 @@
 struct gntdev_dmabuf_priv;
 struct gntdev_priv;
 
-struct gntdev_dmabuf_priv *gntdev_dmabuf_init(void);
+struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp);
 
 void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv);
 
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
 				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u);
 
 long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 5efc5eee9544..2c960f187f7c 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -22,6 +22,7 @@
 
 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
 
+#include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -34,9 +35,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/refcount.h>
-#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
-#include <linux/of_device.h>
-#endif
+#include <linux/workqueue.h>
 
 #include <xen/xen.h>
 #include <xen/grant_table.h>
@@ -57,26 +56,33 @@ MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
 	      "Gerd Hoffmann <kraxel@redhat.com>");
 MODULE_DESCRIPTION("User-space granted page access driver");
 
-static int limit = 1024*1024;
-module_param(limit, int, 0644);
-MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
-		"the gntdev device");
+#define GNTDEV_COPY_BATCH 16
 
-static atomic_t pages_mapped = ATOMIC_INIT(0);
+struct gntdev_copy_batch {
+	struct gnttab_copy ops[GNTDEV_COPY_BATCH];
+	struct page *pages[GNTDEV_COPY_BATCH];
+	s16 __user *status[GNTDEV_COPY_BATCH];
+	unsigned int nr_ops;
+	unsigned int nr_pages;
+	bool writeable;
+	struct gntdev_copy_batch *next;
+};
 
-static int use_ptemod;
-#define populate_freeable_maps use_ptemod
+static unsigned int limit = 64*1024;
+module_param(limit, uint, 0644);
+MODULE_PARM_DESC(limit,
+	"Maximum number of grants that may be mapped by one mapping request");
 
-static int unmap_grant_pages(struct gntdev_grant_map *map,
-			     int offset, int pages);
+static void unmap_grant_pages(struct gntdev_grant_map *map,
+			      int offset, int pages);
 
 static struct miscdevice gntdev_miscdev;
 
 /* ------------------------------------------------------------------ */
 
-bool gntdev_account_mapped_pages(int count)
+bool gntdev_test_page_count(unsigned int count)
 {
-	return atomic_add_return(count, &pages_mapped) > limit;
+	return !count || count > limit;
 }
 
 static void gntdev_print_maps(struct gntdev_priv *priv,
@@ -117,14 +123,15 @@ static void gntdev_free_map(struct gntdev_grant_map *map)
 		gnttab_free_pages(map->count, map->pages);
 
 #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
-	kfree(map->frames);
+	kvfree(map->frames);
 #endif
-	kfree(map->pages);
-	kfree(map->grants);
-	kfree(map->map_ops);
-	kfree(map->unmap_ops);
-	kfree(map->kmap_ops);
-	kfree(map->kunmap_ops);
+	kvfree(map->pages);
+	kvfree(map->grants);
+	kvfree(map->map_ops);
+	kvfree(map->unmap_ops);
+	kvfree(map->kmap_ops);
+	kvfree(map->kunmap_ops);
+	kvfree(map->being_removed);
 	kfree(map);
 }
 
@@ -138,19 +145,29 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
 	if (NULL == add)
 		return NULL;
 
-	add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
-	add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
-	add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
-	add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL);
-	add->kunmap_ops = kcalloc(count, sizeof(add->kunmap_ops[0]), GFP_KERNEL);
-	add->pages = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
+	add->grants = kvmalloc_array(count, sizeof(add->grants[0]),
+				     GFP_KERNEL);
+	add->map_ops = kvmalloc_array(count, sizeof(add->map_ops[0]),
+				      GFP_KERNEL);
+	add->unmap_ops = kvmalloc_array(count, sizeof(add->unmap_ops[0]),
+					GFP_KERNEL);
+	add->pages = kvcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
+	add->being_removed =
+		kvcalloc(count, sizeof(add->being_removed[0]), GFP_KERNEL);
 	if (NULL == add->grants ||
 	    NULL == add->map_ops ||
 	    NULL == add->unmap_ops ||
-	    NULL == add->kmap_ops ||
-	    NULL == add->kunmap_ops ||
-	    NULL == add->pages)
+	    NULL == add->pages ||
+	    NULL == add->being_removed)
 		goto err;
+	if (xen_pv_domain()) {
+		add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]),
+					       GFP_KERNEL);
+		add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]),
+						 GFP_KERNEL);
+		if (NULL == add->kmap_ops || NULL == add->kunmap_ops)
+			goto err;
+	}
 
 #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
 	add->dma_flags = dma_flags;
@@ -162,8 +179,8 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
 	if (dma_flags & (GNTDEV_DMA_FLAG_WC | GNTDEV_DMA_FLAG_COHERENT)) {
 		struct gnttab_dma_alloc_args args;
 
-		add->frames = kcalloc(count, sizeof(add->frames[0]),
-				      GFP_KERNEL);
+		add->frames = kvcalloc(count, sizeof(add->frames[0]),
+				       GFP_KERNEL);
 		if (!add->frames)
 			goto err;
 
@@ -187,10 +204,14 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
 		goto err;
 
 	for (i = 0; i < count; i++) {
-		add->map_ops[i].handle = -1;
-		add->unmap_ops[i].handle = -1;
-		add->kmap_ops[i].handle = -1;
-		add->kunmap_ops[i].handle = -1;
+		add->grants[i].domid = DOMID_INVALID;
+		add->grants[i].ref = INVALID_GRANT_REF;
+		add->map_ops[i].handle = INVALID_GRANT_HANDLE;
+		add->unmap_ops[i].handle = INVALID_GRANT_HANDLE;
+		if (xen_pv_domain()) {
+			add->kmap_ops[i].handle = INVALID_GRANT_HANDLE;
+			add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE;
+		}
 	}
 
 	add->index = 0;
@@ -244,69 +265,76 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 	if (!refcount_dec_and_test(&map->users))
 		return;
 
-	atomic_sub(map->count, &pages_mapped);
+	if (map->pages && !xen_pv_domain()) {
+		/*
+		 * Increment the reference count.  This ensures that the
+		 * subsequent call to unmap_grant_pages() will not wind up
+		 * re-entering itself.  It *can* wind up calling
+		 * gntdev_put_map() recursively, but such calls will be with a
+		 * reference count greater than 1, so they will return before
+		 * this code is reached.  The recursion depth is thus limited to
+		 * 1.  Do NOT use refcount_inc() here, as it will detect that
+		 * the reference count is zero and WARN().
+		 */
+		refcount_set(&map->users, 1);
+
+		/*
+		 * Unmap the grants.  This may or may not be asynchronous, so it
+		 * is possible that the reference count is 1 on return, but it
+		 * could also be greater than 1.
+		 */
+		unmap_grant_pages(map, 0, map->count);
+
+		/* Check if the memory now needs to be freed */
+		if (!refcount_dec_and_test(&map->users))
+			return;
+
+		/*
+		 * All pages have been returned to the hypervisor, so free the
+		 * map.
+		 */
+	}
+
+	if (xen_pv_domain() && map->notifier_init)
+		mmu_interval_notifier_remove(&map->notifier);
 
 	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
 		notify_remote_via_evtchn(map->notify.event);
 		evtchn_put(map->notify.event);
 	}
-
-	if (populate_freeable_maps && priv) {
-		mutex_lock(&priv->lock);
-		list_del(&map->next);
-		mutex_unlock(&priv->lock);
-	}
-
-	if (map->pages && !use_ptemod)
-		unmap_grant_pages(map, 0, map->count);
 	gntdev_free_map(map);
 }
 
 /* ------------------------------------------------------------------ */
 
-static int find_grant_ptes(pte_t *pte, pgtable_t token,
-		unsigned long addr, void *data)
+static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
 {
 	struct gntdev_grant_map *map = data;
-	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
-	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
+	unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT;
+	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte |
+		    (1 << _GNTMAP_guest_avail0);
 	u64 pte_maddr;
 
 	BUG_ON(pgnr >= map->count);
 	pte_maddr = arbitrary_virt_to_machine(pte).maddr;
-
-	/*
-	 * Set the PTE as special to force get_user_pages_fast() fall
-	 * back to the slow path.  If this is not supported as part of
-	 * the grant map, it will be done afterwards.
-	 */
-	if (xen_feature(XENFEAT_gnttab_map_avail_bits))
-		flags |= (1 << _GNTMAP_guest_avail0);
-
+	/* Note: this will perform a pte_mkspecial() through the hypercall. */
 	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
 			  map->grants[pgnr].ref,
 			  map->grants[pgnr].domid);
 	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
-			    -1 /* handle */);
+			    INVALID_GRANT_HANDLE);
 	return 0;
 }
 
-#ifdef CONFIG_X86
-static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token,
-				     unsigned long addr, void *data)
-{
-	set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte));
-	return 0;
-}
-#endif
-
 int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 {
+	size_t alloced = 0;
 	int i, err = 0;
 
-	if (!use_ptemod) {
+	if (!xen_pv_domain()) {
 		/* Note: it could already be mapped */
-		if (map->map_ops[0].handle != -1)
+		if (map->map_ops[0].handle != INVALID_GRANT_HANDLE)
 			return 0;
 		for (i = 0; i < map->count; i++) {
 			unsigned long addr = (unsigned long)
@@ -315,7 +343,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 				map->grants[i].ref,
 				map->grants[i].domid);
 			gnttab_set_unmap_op(&map->unmap_ops[i], addr,
-				map->flags, -1 /* handle */);
+				map->flags, INVALID_GRANT_HANDLE);
 		}
 	} else {
 		/*
@@ -323,111 +351,157 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 		 * to the kernel linear addresses of the struct pages.
		 * These ptes are completely different from the user ptes dealt
		 * with find_grant_ptes.
+		 * Note that GNTMAP_device_map isn't needed here: The
+		 * dev_bus_addr output field gets consumed only from ->map_ops,
+		 * and by not requesting it when mapping we also avoid needing
+		 * to mirror dev_bus_addr into ->unmap_ops (and holding an extra
+		 * reference to the page in the hypervisor).
		 */
+		unsigned int flags = (map->flags & ~GNTMAP_device_map) |
+				     GNTMAP_host_map;
+
 		for (i = 0; i < map->count; i++) {
 			unsigned long address = (unsigned long)
 				pfn_to_kaddr(page_to_pfn(map->pages[i]));
 			BUG_ON(PageHighMem(map->pages[i]));
 
-			gnttab_set_map_op(&map->kmap_ops[i], address,
-				map->flags | GNTMAP_host_map,
+			gnttab_set_map_op(&map->kmap_ops[i], address, flags,
 				map->grants[i].ref,
 				map->grants[i].domid);
 			gnttab_set_unmap_op(&map->kunmap_ops[i], address,
-				map->flags | GNTMAP_host_map, -1);
+				flags, INVALID_GRANT_HANDLE);
 		}
 	}
 
 	pr_debug("map %d+%d\n", map->index, map->count);
-	err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
-			map->pages, map->count);
-	if (err)
-		return err;
+	err = gnttab_map_refs(map->map_ops, map->kmap_ops, map->pages,
+			      map->count);
 
 	for (i = 0; i < map->count; i++) {
-		if (map->map_ops[i].status) {
+		if (map->map_ops[i].status == GNTST_okay) {
+			map->unmap_ops[i].handle = map->map_ops[i].handle;
+			alloced++;
+		} else if (!err)
 			err = -EINVAL;
-			continue;
-		}
 
-		map->unmap_ops[i].handle = map->map_ops[i].handle;
-		if (use_ptemod)
-			map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
-#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
-		else if (map->dma_vaddr) {
-			unsigned long bfn;
+		if (map->flags & GNTMAP_device_map)
+			map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr;
 
-			bfn = pfn_to_bfn(page_to_pfn(map->pages[i]));
-			map->unmap_ops[i].dev_bus_addr = __pfn_to_phys(bfn);
+		if (xen_pv_domain()) {
+			if (map->kmap_ops[i].status == GNTST_okay) {
+				alloced++;
+				map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
+			} else if (!err)
+				err = -EINVAL;
 		}
-#endif
 	}
+	atomic_add(alloced, &map->live_grants);
 	return err;
 }
 
-static int __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
-			       int pages)
+static void __unmap_grant_pages_done(int result,
+		struct gntab_unmap_queue_data *data)
 {
-	int i, err = 0;
-	struct gntab_unmap_queue_data unmap_data;
+	unsigned int i;
+	struct gntdev_grant_map *map = data->data;
+	unsigned int offset = data->unmap_ops - map->unmap_ops;
+	int successful_unmaps = 0;
+	int live_grants;
+
+	for (i = 0; i < data->count; i++) {
+		if (map->unmap_ops[offset + i].status == GNTST_okay &&
+		    map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+			successful_unmaps++;
+
+		WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay &&
+			map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
+		pr_debug("unmap handle=%d st=%d\n",
+			map->unmap_ops[offset+i].handle,
+			map->unmap_ops[offset+i].status);
+		map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
+		if (xen_pv_domain()) {
+			if (map->kunmap_ops[offset + i].status == GNTST_okay &&
+			    map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+				successful_unmaps++;
+
+			WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay &&
+				map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
+			pr_debug("kunmap handle=%u st=%d\n",
+				 map->kunmap_ops[offset+i].handle,
+				 map->kunmap_ops[offset+i].status);
+			map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
+		}
+	}
+
+	/*
+	 * Decrease the live-grant counter.  This must happen after the loop to
+	 * prevent premature reuse of the grants by gnttab_mmap().
+	 */
+	live_grants = atomic_sub_return(successful_unmaps, &map->live_grants);
+	if (WARN_ON(live_grants < 0))
+		pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n",
+		       __func__, live_grants, successful_unmaps);
+
+	/* Release reference taken by __unmap_grant_pages */
+	gntdev_put_map(NULL, map);
+}
 
+static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
+			       int pages)
+{
 	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
 		int pgno = (map->notify.addr >> PAGE_SHIFT);
+
 		if (pgno >= offset && pgno < offset + pages) {
 			/* No need for kmap, pages are in lowmem */
 			uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno]));
+
 			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
 			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
 		}
 	}
 
-	unmap_data.unmap_ops = map->unmap_ops + offset;
-	unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
-	unmap_data.pages = map->pages + offset;
-	unmap_data.count = pages;
+	map->unmap_data.unmap_ops = map->unmap_ops + offset;
+	map->unmap_data.kunmap_ops = xen_pv_domain() ? map->kunmap_ops + offset : NULL;
+	map->unmap_data.pages = map->pages + offset;
+	map->unmap_data.count = pages;
+	map->unmap_data.done = __unmap_grant_pages_done;
+	map->unmap_data.data = map;
+	refcount_inc(&map->users); /* to keep map alive during async call below */
 
-	err = gnttab_unmap_refs_sync(&unmap_data);
-	if (err)
-		return err;
-
-	for (i = 0; i < pages; i++) {
-		if (map->unmap_ops[offset+i].status)
-			err = -EINVAL;
-		pr_debug("unmap handle=%d st=%d\n",
-			map->unmap_ops[offset+i].handle,
-			map->unmap_ops[offset+i].status);
-		map->unmap_ops[offset+i].handle = -1;
-	}
-	return err;
+	gnttab_unmap_refs_async(&map->unmap_data);
 }
 
-static int unmap_grant_pages(struct gntdev_grant_map *map, int offset,
-			     int pages)
+static void unmap_grant_pages(struct gntdev_grant_map *map, int offset,
+			      int pages)
 {
-	int range, err = 0;
+	int range;
+
+	if (atomic_read(&map->live_grants) == 0)
+		return; /* Nothing to do */
 
 	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
 
 	/* It is possible the requested range will have a "hole" where we
 	 * already unmapped some of the grants. Only unmap valid ranges.
	 */
-	while (pages && !err) {
-		while (pages && map->unmap_ops[offset].handle == -1) {
+	while (pages) {
+		while (pages && map->being_removed[offset]) {
 			offset++;
 			pages--;
 		}
 		range = 0;
 		while (range < pages) {
-			if (map->unmap_ops[offset+range].handle == -1)
+			if (map->being_removed[offset + range])
 				break;
+			map->being_removed[offset + range] = true;
 			range++;
 		}
-		err = __unmap_grant_pages(map, offset, range);
+		if (range)
+			__unmap_grant_pages(map, offset, range);
 		offset += range;
 		pages -= range;
 	}
-
-	return err;
 }
 
 /* ------------------------------------------------------------------ */
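The rework above makes unmap_grant_pages() fire-and-forget: gnttab_unmap_refs_async() completes through the unmap_data.done callback, so the map takes an extra reference before submission and drops it in __unmap_grant_pages_done(). A stripped-down sketch of that keep-alive pattern (types and names here are illustrative, not gntdev's):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct async_job {
		refcount_t users;
		void (*done)(struct async_job *job);	/* runs when the op finishes */
	};

	static void job_submit(struct async_job *job)
	{
		/* Pin the object for the duration of the asynchronous operation;
		 * without this, the caller's final put could free it while the
		 * completion handler is still pending. */
		refcount_inc(&job->users);
		/* ... hand 'job' to the asynchronous machinery here ... */
	}

	static void job_complete(struct async_job *job)	/* completion context */
	{
		job->done(job);
		if (refcount_dec_and_test(&job->users))
			kfree(job);	/* last reference may be dropped here */
	}

The same trick explains the refcount_set(&map->users, 1) dance in gntdev_put_map(): the unmap may complete synchronously or asynchronously, and either path must be able to drop the final reference safely.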
@@ -447,23 +521,12 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
 	struct gntdev_priv *priv = file->private_data;
 
 	pr_debug("gntdev_vma_close %p\n", vma);
-	if (use_ptemod) {
-		/* It is possible that an mmu notifier could be running
-		 * concurrently, so take priv->lock to ensure that the vma won't
-		 * vanishing during the unmap_grant_pages call, since we will
-		 * spin here until that completes. Such a concurrent call will
-		 * not do any unmapping, since that has been done prior to
-		 * closing the vma, but it may still iterate the unmap_ops list.
-		 */
-		mutex_lock(&priv->lock);
-		map->vma = NULL;
-		mutex_unlock(&priv->lock);
-	}
+
 	vma->vm_private_data = NULL;
 	gntdev_put_map(priv, map);
 }
 
-static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
+static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma,
 						 unsigned long addr)
 {
 	struct gntdev_grant_map *map = vma->vm_private_data;
@@ -474,114 +537,48 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
 static const struct vm_operations_struct gntdev_vmops = {
 	.open = gntdev_vma_open,
 	.close = gntdev_vma_close,
-	.find_special_page = gntdev_vma_find_special_page,
+	.find_normal_page = gntdev_vma_find_normal_page,
 };
 
 /* ------------------------------------------------------------------ */
 
-static bool in_range(struct gntdev_grant_map *map,
-		      unsigned long start, unsigned long end)
-{
-	if (!map->vma)
-		return false;
-	if (map->vma->vm_start >= end)
-		return false;
-	if (map->vma->vm_end <= start)
-		return false;
-
-	return true;
-}
-
-static int unmap_if_in_range(struct gntdev_grant_map *map,
-			      unsigned long start, unsigned long end,
-			      bool blockable)
+static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
 {
+	struct gntdev_grant_map *map =
+		container_of(mn, struct gntdev_grant_map, notifier);
 	unsigned long mstart, mend;
-	int err;
-
-	if (!in_range(map, start, end))
-		return 0;
-
-	if (!blockable)
-		return -EAGAIN;
-
-	mstart = max(start, map->vma->vm_start);
-	mend = min(end, map->vma->vm_end);
-	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
-			map->index, map->count,
-			map->vma->vm_start, map->vma->vm_end,
-			start, end, mstart, mend);
-	err = unmap_grant_pages(map,
-				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
-				(mend - mstart) >> PAGE_SHIFT);
-	WARN_ON(err);
-
-	return 0;
-}
-
-static int mn_invl_range_start(struct mmu_notifier *mn,
-			       const struct mmu_notifier_range *range)
-{
-	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-	struct gntdev_grant_map *map;
-	int ret = 0;
+	unsigned long map_start, map_end;
 
-	if (range->blockable)
-		mutex_lock(&priv->lock);
-	else if (!mutex_trylock(&priv->lock))
-		return -EAGAIN;
-
-	list_for_each_entry(map, &priv->maps, next) {
-		ret = unmap_if_in_range(map, range->start, range->end,
-					range->blockable);
-		if (ret)
-			goto out_unlock;
-	}
-	list_for_each_entry(map, &priv->freeable_maps, next) {
-		ret = unmap_if_in_range(map, range->start, range->end,
-					range->blockable);
-		if (ret)
-			goto out_unlock;
-	}
+	if (!mmu_notifier_range_blockable(range))
+		return false;
 
-out_unlock:
-	mutex_unlock(&priv->lock);
+	map_start = map->pages_vm_start;
+	map_end = map->pages_vm_start + (map->count << PAGE_SHIFT);
 
-	return ret;
-}
+	/*
+	 * If the VMA is split or otherwise changed the notifier is not
+	 * updated, but we don't want to process VA's outside the modified
+	 * VMA. FIXME: It would be much more understandable to just prevent
+	 * modifying the VMA in the first place.
+	 */
+	if (map_start >= range->end || map_end <= range->start)
+		return true;
 
-static void mn_release(struct mmu_notifier *mn,
-		       struct mm_struct *mm)
-{
-	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-	struct gntdev_grant_map *map;
-	int err;
+	mstart = max(range->start, map_start);
+	mend = min(range->end, map_end);
+	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+		 map->index, map->count, map_start, map_end,
		 range->start, range->end, mstart, mend);
+	unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT,
			  (mend - mstart) >> PAGE_SHIFT);
 
-	mutex_lock(&priv->lock);
-	list_for_each_entry(map, &priv->maps, next) {
-		if (!map->vma)
-			continue;
-		pr_debug("map %d+%d (%lx %lx)\n",
-				map->index, map->count,
-				map->vma->vm_start, map->vma->vm_end);
-		err = unmap_grant_pages(map, /* offset */ 0, map->count);
-		WARN_ON(err);
-	}
-	list_for_each_entry(map, &priv->freeable_maps, next) {
-		if (!map->vma)
-			continue;
-		pr_debug("map %d+%d (%lx %lx)\n",
-				map->index, map->count,
-				map->vma->vm_start, map->vma->vm_end);
-		err = unmap_grant_pages(map, /* offset */ 0, map->count);
-		WARN_ON(err);
-	}
-	mutex_unlock(&priv->lock);
+	return true;
 }
 
-static const struct mmu_notifier_ops gntdev_mmu_ops = {
-	.release = mn_release,
-	.invalidate_range_start = mn_invl_range_start,
+static const struct mmu_interval_notifier_ops gntdev_mmu_ops = {
+	.invalidate = gntdev_invalidate,
 };
 
 /* ------------------------------------------------------------------ */
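gntdev now tracks exactly one VA range per map with an mmu_interval_notifier instead of one mmu_notifier per open file, so an invalidation no longer has to walk every map under priv->lock. A hedged sketch of how such a notifier is typically registered against a range (mmu_interval_notifier_insert() is the real kernel API; the wrapper around it is hypothetical):

	#include <linux/mmu_notifier.h>

	/* Assumed helper: called once the VA range backing a map is known,
	 * e.g. from the driver's mmap handler. */
	static int track_range_sketch(struct mmu_interval_notifier *mni,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long npages)
	{
		/* The .invalidate callback in gntdev_mmu_ops above will fire
		 * for any CPU page-table change overlapping this range. */
		return mmu_interval_notifier_insert(mni, mm, start,
						    npages << PAGE_SHIFT,
						    &gntdev_mmu_ops);
	}

Teardown is the matching mmu_interval_notifier_remove(), which gntdev_put_map() above now performs when notifier_init is set.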
@@ -589,52 +586,30 @@ static const struct mmu_notifier_ops gntdev_mmu_ops = {
 static int gntdev_open(struct inode *inode, struct file *flip)
 {
 	struct gntdev_priv *priv;
-	int ret = 0;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&priv->maps);
-	INIT_LIST_HEAD(&priv->freeable_maps);
 	mutex_init(&priv->lock);
+	mutex_init(&priv->batch_lock);
+
 #ifdef CONFIG_XEN_GNTDEV_DMABUF
-	priv->dmabuf_priv = gntdev_dmabuf_init();
+	priv->dmabuf_priv = gntdev_dmabuf_init(flip);
 	if (IS_ERR(priv->dmabuf_priv)) {
-		ret = PTR_ERR(priv->dmabuf_priv);
-		kfree(priv);
-		return ret;
-	}
-#endif
+		int ret = PTR_ERR(priv->dmabuf_priv);
 
-	if (use_ptemod) {
-		priv->mm = get_task_mm(current);
-		if (!priv->mm) {
-			kfree(priv);
-			return -ENOMEM;
-		}
-		priv->mn.ops = &gntdev_mmu_ops;
-		ret = mmu_notifier_register(&priv->mn, priv->mm);
-		mmput(priv->mm);
-	}
-
-	if (ret) {
 		kfree(priv);
 		return ret;
 	}
+#endif
 
 	flip->private_data = priv;
 #ifdef CONFIG_XEN_GRANT_DMA_ALLOC
 	priv->dma_dev = gntdev_miscdev.this_device;
-
-	/*
-	 * The device is not spawn from a device tree, so arch_setup_dma_ops
-	 * is not called, thus leaving the device with dummy DMA ops.
-	 * Fix this by calling of_dma_configure() with a NULL node to set
-	 * default DMA ops.
-	 */
-	of_dma_configure(priv->dma_dev, NULL, true);
+	dma_coerce_mask_and_coherent(priv->dma_dev, DMA_BIT_MASK(64));
 #endif
 	pr_debug("priv %p\n", priv);
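gntdev_release() below drains a per-file cache of gntdev_copy_batch structures, and gntdev_ioctl_grant_copy() pops from the same list instead of placing a large batch on the kernel stack. A simplified sketch of that mutex-protected LIFO freelist (structure and function names are illustrative):

	#include <linux/mutex.h>
	#include <linux/slab.h>

	struct batch {
		struct batch *next;
		/* staging arrays for one ioctl invocation would live here */
	};

	struct ctx {
		struct mutex batch_lock;
		struct batch *batch;	/* singly-linked freelist */
	};

	static struct batch *batch_get(struct ctx *ctx)
	{
		struct batch *b;

		mutex_lock(&ctx->batch_lock);
		b = ctx->batch;
		if (b)
			ctx->batch = b->next;	/* pop a cached instance */
		mutex_unlock(&ctx->batch_lock);
		if (!b)
			b = kmalloc(sizeof(*b), GFP_KERNEL);	/* cold path */
		return b;
	}

	static void batch_put(struct ctx *ctx, struct batch *b)
	{
		mutex_lock(&ctx->batch_lock);
		b->next = ctx->batch;	/* push back for reuse */
		ctx->batch = b;
		mutex_unlock(&ctx->batch_lock);
	}

Recycling through the file's context keeps the hot grant-copy path allocation-free after the first call while bounding memory to the peak number of concurrent ioctls.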
- */ - of_dma_configure(priv->dma_dev, NULL, true); + dma_coerce_mask_and_coherent(priv->dma_dev, DMA_BIT_MASK(64)); #endif pr_debug("priv %p\n", priv); @@ -645,6 +620,7 @@ static int gntdev_release(struct inode *inode, struct file *flip) { struct gntdev_priv *priv = flip->private_data; struct gntdev_grant_map *map; + struct gntdev_copy_batch *batch; pr_debug("priv %p\n", priv); @@ -655,16 +631,20 @@ static int gntdev_release(struct inode *inode, struct file *flip) list_del(&map->next); gntdev_put_map(NULL /* already removed */, map); } - WARN_ON(!list_empty(&priv->freeable_maps)); mutex_unlock(&priv->lock); + mutex_lock(&priv->batch_lock); + while (priv->batch) { + batch = priv->batch; + priv->batch = batch->next; + kfree(batch); + } + mutex_unlock(&priv->batch_lock); + #ifdef CONFIG_XEN_GNTDEV_DMABUF gntdev_dmabuf_fini(priv->dmabuf_priv); #endif - if (use_ptemod) - mmu_notifier_unregister(&priv->mn, priv->mm); - kfree(priv); return 0; } @@ -679,7 +659,7 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, add %d\n", priv, op.count); - if (unlikely(op.count <= 0)) + if (unlikely(gntdev_test_page_count(op.count))) return -EINVAL; err = -ENOMEM; @@ -687,12 +667,6 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (!map) return err; - if (unlikely(gntdev_account_mapped_pages(op.count))) { - pr_debug("can't map: over limit\n"); - gntdev_put_map(NULL, map); - return err; - } - if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { gntdev_put_map(NULL, map); @@ -725,8 +699,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - if (populate_freeable_maps) - list_add_tail(&map->next, &priv->freeable_maps); err = 0; } mutex_unlock(&priv->lock); @@ -747,7 +719,7 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) goto out_unlock; @@ -761,7 +733,7 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, rv = 0; out_unlock: - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; @@ -774,7 +746,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) struct gntdev_grant_map *map; int rc; int out_flags; - unsigned int out_event; + evtchn_port_t out_event; if (copy_from_user(&op, u, sizeof(op))) return -EFAULT; @@ -834,25 +806,15 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) return rc; } -#define GNTDEV_COPY_BATCH 16 - -struct gntdev_copy_batch { - struct gnttab_copy ops[GNTDEV_COPY_BATCH]; - struct page *pages[GNTDEV_COPY_BATCH]; - s16 __user *status[GNTDEV_COPY_BATCH]; - unsigned int nr_ops; - unsigned int nr_pages; -}; - static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, - bool writeable, unsigned long *gfn) + unsigned long *gfn) { unsigned long addr = (unsigned long)virt; struct page *page; unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = pin_user_pages_fast(addr, 1, batch->writeable ? 
FOLL_WRITE : 0, &page); if (ret < 0) return ret; @@ -866,11 +828,9 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, static void gntdev_put_pages(struct gntdev_copy_batch *batch) { - unsigned int i; - - for (i = 0; i < batch->nr_pages; i++) - put_page(batch->pages[i]); + unpin_user_pages_dirty_lock(batch->pages, batch->nr_pages, batch->writeable); batch->nr_pages = 0; + batch->writeable = false; } static int gntdev_copy(struct gntdev_copy_batch *batch) @@ -959,8 +919,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, virt = seg->source.virt + copied; off = (unsigned long)virt & ~XEN_PAGE_MASK; len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = false; - ret = gntdev_get_page(batch, virt, false, &gfn); + ret = gntdev_get_page(batch, virt, &gfn); if (ret < 0) return ret; @@ -978,8 +939,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, virt = seg->dest.virt + copied; off = (unsigned long)virt & ~XEN_PAGE_MASK; len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = true; - ret = gntdev_get_page(batch, virt, true, &gfn); + ret = gntdev_get_page(batch, virt, &gfn); if (ret < 0) return ret; @@ -1001,36 +963,53 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u) { struct ioctl_gntdev_grant_copy copy; - struct gntdev_copy_batch batch; + struct gntdev_copy_batch *batch; unsigned int i; int ret = 0; if (copy_from_user(&copy, u, sizeof(copy))) return -EFAULT; - batch.nr_ops = 0; - batch.nr_pages = 0; + mutex_lock(&priv->batch_lock); + if (!priv->batch) { + batch = kmalloc(sizeof(*batch), GFP_KERNEL); + } else { + batch = priv->batch; + priv->batch = batch->next; + } + mutex_unlock(&priv->batch_lock); + if (!batch) + return -ENOMEM; + + batch->nr_ops = 0; + batch->nr_pages = 0; for (i = 0; i < copy.count; i++) { struct gntdev_grant_copy_segment seg; if (copy_from_user(&seg, &copy.segments[i], sizeof(seg))) { ret = -EFAULT; + gntdev_put_pages(batch); goto out; } - ret = gntdev_grant_copy_seg(&batch, &seg, &copy.segments[i].status); - if (ret < 0) + ret = gntdev_grant_copy_seg(batch, &seg, &copy.segments[i].status); + if (ret < 0) { + gntdev_put_pages(batch); goto out; + } cond_resched(); } - if (batch.nr_ops) - ret = gntdev_copy(&batch); - return ret; + if (batch->nr_ops) + ret = gntdev_copy(batch); + + out: + mutex_lock(&priv->batch_lock); + batch->next = priv->batch; + priv->batch = batch; + mutex_unlock(&priv->batch_lock); - out: - gntdev_put_pages(&batch); return ret; } @@ -1058,7 +1037,7 @@ static long gntdev_ioctl(struct file *flip, #ifdef CONFIG_XEN_GNTDEV_DMABUF case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS: - return gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr); + return gntdev_ioctl_dmabuf_exp_from_refs(priv, ptr); case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED: return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr); @@ -1084,39 +1063,31 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) int index = vma->vm_pgoff; int count = vma_pages(vma); struct gntdev_grant_map *map; - int i, err = -EINVAL; + int err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; pr_debug("map %d+%d at %lx (pgoff %lx)\n", - index, count, vma->vm_start, vma->vm_pgoff); + index, count, vma->vm_start, vma->vm_pgoff); mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; - if (use_ptemod && map->vma) + if (!atomic_add_unless(&map->in_use, 1,
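The grant-copy rework above stops keeping the copy batch on the stack and instead recycles batches through a free list hanging off the file's private data, under priv->batch_lock. A rough userspace model of that recycling (a pthread mutex standing in for the kernel mutex; names are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct batch {
	struct batch *next;
	unsigned int nr_ops;
};

static struct batch *batch_free_list;	/* models priv->batch */
static pthread_mutex_t batch_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pop a cached batch, or allocate one on first use. */
static struct batch *batch_get(void)
{
	struct batch *b;

	pthread_mutex_lock(&batch_lock);
	b = batch_free_list;
	if (b)
		batch_free_list = b->next;
	pthread_mutex_unlock(&batch_lock);

	return b ? b : calloc(1, sizeof(*b));
}

/* Return the batch to the free list for the next caller to reuse. */
static void batch_put(struct batch *b)
{
	pthread_mutex_lock(&batch_lock);
	b->next = batch_free_list;
	batch_free_list = b;
	pthread_mutex_unlock(&batch_lock);
}

int main(void)
{
	struct batch *b = batch_get();

	b->nr_ops = 0;		/* ... fill and issue the copy ... */
	batch_put(b);
	return 0;
}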
1)) goto unlock_out; - if (use_ptemod && priv->mm != vma->vm_mm) { - pr_warn("Huh? Other mm?\n"); - goto unlock_out; - } refcount_inc(&map->users); vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); - if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + if (xen_pv_domain()) + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; - - if (use_ptemod) - map->vma = vma; - if (map->flags) { if ((vma->vm_flags & VM_WRITE) && (map->flags & GNTMAP_readonly)) @@ -1127,10 +1098,32 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->flags |= GNTMAP_readonly; } + map->pages_vm_start = vma->vm_start; + + if (xen_pv_domain()) { + err = mmu_interval_notifier_insert_locked( + &map->notifier, vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, &gntdev_mmu_ops); + if (err) + goto out_unlock_put; + + map->notifier_init = true; + } mutex_unlock(&priv->lock); - if (use_ptemod) { - map->pages_vm_start = vma->vm_start; + if (xen_pv_domain()) { + /* + * gntdev takes the address of the PTE in find_grant_ptes() and + * passes it to the hypervisor in gntdev_map_grant_pages(). The + * purpose of the notifier is to prevent the hypervisor pointer + * to the PTE from going stale. + * + * Since this vma's mappings can't be touched without the + * mmap_lock, and we are holding it now, there is no need for + * the notifier_range locking pattern. + */ + mmu_interval_read_begin(&map->notifier); + err = apply_to_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, find_grant_ptes, map); @@ -1144,30 +1137,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (err) goto out_put_map; - if (!use_ptemod) { - for (i = 0; i < count; i++) { - err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, - map->pages[i]); - if (err) - goto out_put_map; - } - } else { -#ifdef CONFIG_X86 - /* - * If the PTEs were not made special by the grant map - * hypercall, do so here. - * - * This is racy since the mapping is already visible - * to userspace but userspace should be well-behaved - * enough to not touch it until the mmap() call - * returns. - */ - if (!xen_feature(XENFEAT_gnttab_map_avail_bits)) { - apply_to_page_range(vma->vm_mm, vma->vm_start, - vma->vm_end - vma->vm_start, - set_grant_ptes_as_special, NULL); - } -#endif + if (!xen_pv_domain()) { + err = vm_map_pages_zero(vma, map->pages, map->count); + if (err) + goto out_put_map; } return 0; @@ -1179,10 +1152,8 @@ unlock_out: out_unlock_put: mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) { - map->vma = NULL; + if (xen_pv_domain()) unmap_grant_pages(map, 0, map->count); - } gntdev_put_map(priv, map); return err; } @@ -1210,8 +1181,6 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); - err = misc_register(&gntdev_miscdev); if (err != 0) { pr_err("Could not register gntdev device\n"); diff --git a/drivers/xen/grant-dma-iommu.c b/drivers/xen/grant-dma-iommu.c new file mode 100644 index 000000000000..0965e2dd4edf --- /dev/null +++ b/drivers/xen/grant-dma-iommu.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stub IOMMU driver which does nothing. + * The main purpose of it being present is to reuse generic IOMMU device tree + * bindings by Xen grant DMA-mapping layer. + * + * Copyright (C) 2022 EPAM Systems Inc. 
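For context on the gntdev_mmap() changes above, this is roughly how a userspace client consumes the device: ask for an offset with IOCTL_GNTDEV_MAP_GRANT_REF, then mmap() that offset. A sketch built on the uapi header, with placeholder domid/grant values (these must come from the peer domain), assuming the grant was offered writable, and with most error handling elided:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <xen/gntdev.h>		/* uapi ioctl definitions */

int main(void)
{
	struct ioctl_gntdev_map_grant_ref map;
	int fd = open("/dev/xen/gntdev", O_RDWR);
	void *addr;

	if (fd < 0)
		return 1;

	memset(&map, 0, sizeof(map));
	map.count = 1;
	map.refs[0].domid = 1;	/* placeholder: peer domain id */
	map.refs[0].ref = 42;	/* placeholder: grant reference */

	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map))
		return 1;

	/* The ioctl only returns an offset; the actual mapping happens
	 * here, which is what gntdev_mmap() above implements. */
	addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, map.index);
	if (addr == MAP_FAILED)
		return 1;

	printf("first byte of shared page: %d\n", *(unsigned char *)addr);
	munmap(addr, 4096);
	close(fd);
	return 0;
}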
+ */ + +#include <linux/iommu.h> +#include <linux/of.h> +#include <linux/platform_device.h> + +struct grant_dma_iommu_device { + struct device *dev; + struct iommu_device iommu; +}; + +static struct iommu_device *grant_dma_iommu_probe_device(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + +/* Nothing is really needed here except a dummy probe_device callback */ +static const struct iommu_ops grant_dma_iommu_ops = { + .probe_device = grant_dma_iommu_probe_device, +}; + +static const struct of_device_id grant_dma_iommu_of_match[] = { + { .compatible = "xen,grant-dma" }, + { }, +}; + +static int grant_dma_iommu_probe(struct platform_device *pdev) +{ + struct grant_dma_iommu_device *mmu; + int ret; + + mmu = devm_kzalloc(&pdev->dev, sizeof(*mmu), GFP_KERNEL); + if (!mmu) + return -ENOMEM; + + mmu->dev = &pdev->dev; + + ret = iommu_device_register(&mmu->iommu, &grant_dma_iommu_ops, &pdev->dev); + if (ret) + return ret; + + platform_set_drvdata(pdev, mmu); + + return 0; +} + +static void grant_dma_iommu_remove(struct platform_device *pdev) +{ + struct grant_dma_iommu_device *mmu = platform_get_drvdata(pdev); + + platform_set_drvdata(pdev, NULL); + iommu_device_unregister(&mmu->iommu); +} + +static struct platform_driver grant_dma_iommu_driver = { + .driver = { + .name = "grant-dma-iommu", + .of_match_table = grant_dma_iommu_of_match, + }, + .probe = grant_dma_iommu_probe, + .remove = grant_dma_iommu_remove, +}; + +static int __init grant_dma_iommu_init(void) +{ + struct device_node *iommu_np; + + iommu_np = of_find_matching_node(NULL, grant_dma_iommu_of_match); + if (!iommu_np) + return 0; + + of_node_put(iommu_np); + + return platform_driver_register(&grant_dma_iommu_driver); +} +subsys_initcall(grant_dma_iommu_init); diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c new file mode 100644 index 000000000000..14077d23f2a1 --- /dev/null +++ b/drivers/xen/grant-dma-ops.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Xen grant DMA-mapping layer - contains special DMA-mapping routines + * for providing grant references as DMA addresses to be used by frontends + * (e.g. virtio) in Xen guests + * + * Copyright (c) 2021, Juergen Gross <jgross@suse.com> + */ + +#include <linux/module.h> +#include <linux/dma-map-ops.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/xarray.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/grant_table.h> + +struct xen_grant_dma_data { + /* The ID of backend domain */ + domid_t backend_domid; + /* Is device behaving sane? 
*/ + bool broken; +}; + +static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ); + +#define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63) + +static inline dma_addr_t grant_to_dma(grant_ref_t grant) +{ + return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant << XEN_PAGE_SHIFT); +} + +static inline grant_ref_t dma_to_grant(dma_addr_t dma) +{ + return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >> XEN_PAGE_SHIFT); +} + +static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev) +{ + struct xen_grant_dma_data *data; + unsigned long flags; + + xa_lock_irqsave(&xen_grant_dma_devices, flags); + data = xa_load(&xen_grant_dma_devices, (unsigned long)dev); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); + + return data; +} + +static int store_xen_grant_dma_data(struct device *dev, + struct xen_grant_dma_data *data) +{ + unsigned long flags; + int ret; + + xa_lock_irqsave(&xen_grant_dma_devices, flags); + ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data, + GFP_ATOMIC)); + xa_unlock_irqrestore(&xen_grant_dma_devices, flags); + + return ret; +} + +/* + * DMA ops for Xen frontends (e.g. virtio). + * + * Used to act as a kind of software IOMMU for Xen guests by using grants as + * DMA addresses. + * Such a DMA address is formed by using the grant reference as a frame + * number and setting the highest address bit (this bit is for the backend + * to be able to distinguish it from e.g. a mmio address). + */ +static void *xen_grant_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned int i, n_pages = XEN_PFN_UP(size); + unsigned long pfn; + grant_ref_t grant; + void *ret; + + data = find_xen_grant_dma_data(dev); + if (!data) + return NULL; + + if (unlikely(data->broken)) + return NULL; + + ret = alloc_pages_exact(n_pages * XEN_PAGE_SIZE, gfp); + if (!ret) + return NULL; + + pfn = virt_to_pfn(ret); + + if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) { + free_pages_exact(ret, n_pages * XEN_PAGE_SIZE); + return NULL; + } + + for (i = 0; i < n_pages; i++) { + gnttab_grant_foreign_access_ref(grant + i, data->backend_domid, + pfn_to_gfn(pfn + i), 0); + } + + *dma_handle = grant_to_dma(grant); + + return ret; +} + +static void xen_grant_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned int i, n_pages = XEN_PFN_UP(size); + grant_ref_t grant; + + data = find_xen_grant_dma_data(dev); + if (!data) + return; + + if (unlikely(data->broken)) + return; + + grant = dma_to_grant(dma_handle); + + for (i = 0; i < n_pages; i++) { + if (unlikely(!gnttab_end_foreign_access_ref(grant + i))) { + dev_alert(dev, "Grant still in use by backend domain, disabled for further use\n"); + data->broken = true; + return; + } + } + + gnttab_free_grant_reference_seq(grant, n_pages); + + free_pages_exact(vaddr, n_pages * XEN_PAGE_SIZE); +} + +static struct page *xen_grant_dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, + enum dma_data_direction dir, + gfp_t gfp) +{ + void *vaddr; + + vaddr = xen_grant_dma_alloc(dev, size, dma_handle, gfp, 0); + if (!vaddr) + return NULL; + + return virt_to_page(vaddr); +} + +static void xen_grant_dma_free_pages(struct device *dev, size_t size, + struct page *vaddr, dma_addr_t dma_handle, + enum dma_data_direction dir) +{ + xen_grant_dma_free(dev, size, page_to_virt(vaddr), dma_handle, 0); +} + +static dma_addr_t 
xen_grant_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned long offset = offset_in_page(phys); + unsigned long dma_offset = xen_offset_in_page(offset), + pfn_offset = XEN_PFN_DOWN(offset); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); + grant_ref_t grant; + dma_addr_t dma_handle; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (WARN_ON(dir == DMA_NONE)) + return DMA_MAPPING_ERROR; + + data = find_xen_grant_dma_data(dev); + if (!data) + return DMA_MAPPING_ERROR; + + if (unlikely(data->broken)) + return DMA_MAPPING_ERROR; + + if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) + return DMA_MAPPING_ERROR; + + for (i = 0; i < n_pages; i++) { + gnttab_grant_foreign_access_ref(grant + i, data->backend_domid, + pfn_to_gfn(page_to_xen_pfn(phys_to_page(phys)) + i + pfn_offset), + dir == DMA_TO_DEVICE); + } + + dma_handle = grant_to_dma(grant) + dma_offset; + + return dma_handle; +} + +static void xen_grant_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct xen_grant_dma_data *data; + unsigned long dma_offset = xen_offset_in_page(dma_handle); + unsigned int i, n_pages = XEN_PFN_UP(dma_offset + size); + grant_ref_t grant; + + if (WARN_ON(dir == DMA_NONE)) + return; + + data = find_xen_grant_dma_data(dev); + if (!data) + return; + + if (unlikely(data->broken)) + return; + + grant = dma_to_grant(dma_handle); + + for (i = 0; i < n_pages; i++) { + if (unlikely(!gnttab_end_foreign_access_ref(grant + i))) { + dev_alert(dev, "Grant still in use by backend domain, disabled for further use\n"); + data->broken = true; + return; + } + } + + gnttab_free_grant_reference_seq(grant, n_pages); +} + +static void xen_grant_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + unsigned int i; + + if (WARN_ON(dir == DMA_NONE)) + return; + + for_each_sg(sg, s, nents, i) + xen_grant_dma_unmap_phys(dev, s->dma_address, sg_dma_len(s), dir, + attrs); +} + +static int xen_grant_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + unsigned int i; + + if (WARN_ON(dir == DMA_NONE)) + return -EINVAL; + + for_each_sg(sg, s, nents, i) { + s->dma_address = xen_grant_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); + if (s->dma_address == DMA_MAPPING_ERROR) + goto out; + + sg_dma_len(s) = s->length; + } + + return nents; + +out: + xen_grant_dma_unmap_sg(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sg) = 0; + + return -EIO; +} + +static int xen_grant_dma_supported(struct device *dev, u64 mask) +{ + return mask == DMA_BIT_MASK(64); +} + +static const struct dma_map_ops xen_grant_dma_ops = { + .alloc = xen_grant_dma_alloc, + .free = xen_grant_dma_free, + .alloc_pages_op = xen_grant_dma_alloc_pages, + .free_pages = xen_grant_dma_free_pages, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .map_phys = xen_grant_dma_map_phys, + .unmap_phys = xen_grant_dma_unmap_phys, + .map_sg = xen_grant_dma_map_sg, + .unmap_sg = xen_grant_dma_unmap_sg, + .dma_supported = xen_grant_dma_supported, +}; + +static struct device_node *xen_dt_get_node(struct device *dev) +{ + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_bus *bus = pdev->bus; + + 
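The grant_to_dma()/dma_to_grant() helpers above define the address format this whole file relies on: the grant reference serves as the frame number, and bit 63 marks the address as grant-based so a backend can tell it apart from ordinary RAM or MMIO. A standalone round-trip of that encoding (assuming 4 KiB Xen pages):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define XEN_PAGE_SHIFT 12
#define XEN_GRANT_DMA_ADDR_OFF (1ULL << 63)

static uint64_t grant_to_dma(uint32_t grant)
{
	return XEN_GRANT_DMA_ADDR_OFF | ((uint64_t)grant << XEN_PAGE_SHIFT);
}

static uint32_t dma_to_grant(uint64_t dma)
{
	return (uint32_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >> XEN_PAGE_SHIFT);
}

int main(void)
{
	uint32_t grant = 0x1234;
	/* An in-page offset survives the round trip, as in map_phys(). */
	uint64_t dma = grant_to_dma(grant) + 0x80;

	assert(dma_to_grant(dma) == grant);
	printf("grant %#x <-> dma %#llx\n", (unsigned)grant,
	       (unsigned long long)dma);
	return 0;
}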
/* Walk up to the root bus to look for PCI Host controller */ + while (!pci_is_root_bus(bus)) + bus = bus->parent; + + if (!bus->bridge->parent) + return NULL; + return of_node_get(bus->bridge->parent->of_node); + } + + return of_node_get(dev->of_node); +} + +static int xen_dt_grant_init_backend_domid(struct device *dev, + struct device_node *np, + domid_t *backend_domid) +{ + struct of_phandle_args iommu_spec = { .args_count = 1 }; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + + if (of_map_id(np, rid, "iommu-map", "iommu-map-mask", &iommu_spec.np, + iommu_spec.args)) { + dev_dbg(dev, "Cannot translate ID\n"); + return -ESRCH; + } + } else { + if (of_parse_phandle_with_args(np, "iommus", "#iommu-cells", + 0, &iommu_spec)) { + dev_dbg(dev, "Cannot parse iommus property\n"); + return -ESRCH; + } + } + + if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") || + iommu_spec.args_count != 1) { + dev_dbg(dev, "Incompatible IOMMU node\n"); + of_node_put(iommu_spec.np); + return -ESRCH; + } + + of_node_put(iommu_spec.np); + + /* + * The endpoint ID here means the ID of the domain where the + * corresponding backend is running + */ + *backend_domid = iommu_spec.args[0]; + + return 0; +} + +static int xen_grant_init_backend_domid(struct device *dev, + domid_t *backend_domid) +{ + struct device_node *np; + int ret = -ENODEV; + + np = xen_dt_get_node(dev); + if (np) { + ret = xen_dt_grant_init_backend_domid(dev, np, backend_domid); + of_node_put(np); + } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain()) { + dev_info(dev, "Using dom0 as backend\n"); + *backend_domid = 0; + ret = 0; + } + + return ret; +} + +static void xen_grant_setup_dma_ops(struct device *dev, domid_t backend_domid) +{ + struct xen_grant_dma_data *data; + + data = find_xen_grant_dma_data(dev); + if (data) { + dev_err(dev, "Xen grant DMA data is already created\n"); + return; + } + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + goto err; + + data->backend_domid = backend_domid; + + if (store_xen_grant_dma_data(dev, data)) { + dev_err(dev, "Cannot store Xen grant DMA data\n"); + goto err; + } + + dev->dma_ops = &xen_grant_dma_ops; + + return; + +err: + devm_kfree(dev, data); + dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n"); +} + +bool xen_virtio_restricted_mem_acc(struct virtio_device *dev) +{ + domid_t backend_domid; + + if (!xen_grant_init_backend_domid(dev->dev.parent, &backend_domid)) { + xen_grant_setup_dma_ops(dev->dev.parent, backend_domid); + return true; + } + + return false; +} + +MODULE_DESCRIPTION("Xen grant DMA-mapping layer"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7ea6fb6a2e5d..3e76e33f6e08 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,6 +33,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/bitmap.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> @@ -64,18 +65,38 @@ #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> -#include <asm/pgtable.h> #include <asm/sync_bitops.h> -/* External tools reserve first few grant table entries. 
*/ -#define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; + +/* + * Handling of free grants: + * + * Free grants are in a simple list anchored in gnttab_free_head. They are + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number + * of free entries is stored in gnttab_free_count. + * Additionally there is a bitmap of free entries anchored in + * gnttab_free_bitmap. This is being used for simplifying allocation of + * multiple consecutive grants, which is needed e.g. for support of virtio. + * gnttab_last_free is used to add free entries of new frames at the end of + * the free list. + * gnttab_free_tail_ptr specifies the variable which references the start + * of consecutive free grants ending with gnttab_last_free. This pointer is + * updated in a rather defensive way, in order to avoid performance hits in + * hot paths. + * All those variables are protected by gnttab_list_lock. + */ static int gnttab_free_count; -static grant_ref_t gnttab_free_head; +static unsigned int gnttab_size; +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; +static grant_ref_t *gnttab_free_tail_ptr; +static unsigned long *gnttab_free_bitmap; static DEFINE_SPINLOCK(gnttab_list_lock); + struct grant_frames xen_auto_xlat_grant_frames; static unsigned int xen_gnttab_version; module_param_named(version, xen_gnttab_version, uint, 0); @@ -110,7 +131,7 @@ struct gnttab_ops { void (*unmap_frames)(void); /* * Introducing a valid entry into the grant table, granting the frame of - * this grant entry to domain for accessing or transfering. Ref + * this grant entry to domain for accessing. Ref * parameter is reference of this introduced grant entry, domid is id of * granted domain, frame is the page frame to be granted, and flags is * status of the grant entry to be updated. @@ -119,28 +140,16 @@ struct gnttab_ops { unsigned long frame, unsigned flags); /* * Stop granting a grant entry to domain for accessing. Ref parameter is - * reference of a grant entry whose grant access will be stopped, - * readonly is not in use in this function. If the grant entry is - * currently mapped for reading or writing, just return failure(==0) - * directly and don't tear down the grant access. Otherwise, stop grant - * access for this entry and return success(==1). - */ - int (*end_foreign_access_ref)(grant_ref_t ref, int readonly); - /* - * Stop granting a grant entry to domain for transfer. Ref parameter is - * reference of a grant entry whose grant transfer will be stopped. If - * tranfer has not started, just reclaim the grant entry and return - * failure(==0). Otherwise, wait for the transfer to complete and then - * return the frame. + * reference of a grant entry whose grant access will be stopped. + * If the grant entry is currently mapped for reading or writing, just + * return failure(==0) directly and don't tear down the grant access. + * Otherwise, stop grant access for this entry and return success(==1). */ - unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + int (*end_foreign_access_ref)(grant_ref_t ref); /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. + * Read the frame number related to a given grant reference. 
*/ - int (*query_foreign_access)(grant_ref_t ref); + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -183,16 +192,116 @@ static int get_free_entries(unsigned count) ref = head = gnttab_free_head; gnttab_free_count -= count; - while (count-- > 1) - head = gnttab_entry(head); + while (count--) { + bitmap_clear(gnttab_free_bitmap, head, 1); + if (gnttab_free_tail_ptr == __gnttab_entry(head)) + gnttab_free_tail_ptr = &gnttab_free_head; + if (count) + head = gnttab_entry(head); + } gnttab_free_head = gnttab_entry(head); gnttab_entry(head) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_last_free = GNTTAB_LIST_END; + gnttab_free_tail_ptr = NULL; + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); return ref; } +static int get_seq_entry_count(void) +{ + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || + *gnttab_free_tail_ptr == GNTTAB_LIST_END) + return 0; + + return gnttab_last_free - *gnttab_free_tail_ptr + 1; +} + +/* Rebuilds the free grant list and tries to find count consecutive entries. */ +static int get_free_seq(unsigned int count) +{ + int ret = -ENOSPC; + unsigned int from, to; + grant_ref_t *last; + + gnttab_free_tail_ptr = &gnttab_free_head; + last = &gnttab_free_head; + + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); + from < gnttab_size; + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, + from + 1); + if (ret < 0 && to - from >= count) { + ret = from; + bitmap_clear(gnttab_free_bitmap, ret, count); + from += count; + gnttab_free_count -= count; + if (from == to) + continue; + } + + /* + * Recreate the free list in order to have it properly sorted. + * This is needed to make sure that the free tail has the maximum + * possible size. 
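A toy model of the consecutive-entry search that get_free_seq() implements above: scan a free bitmap (set bit == free entry) for a run of count bits and claim it. This uses a single 64-bit word instead of the kernel's bitmap helpers, purely for illustration:

#include <stdio.h>

#define NBITS 64

/* Return the first index of a run of `count` free bits, claiming the
 * run (clearing the bits), or -1 if no such run exists. */
static int find_free_run(unsigned long long *bm, unsigned int count)
{
	unsigned int i, run = 0;

	for (i = 0; i < NBITS; i++) {
		if (*bm & (1ULL << i)) {
			if (++run == count) {
				unsigned int first = i - count + 1, j;

				for (j = first; j <= i; j++)
					*bm &= ~(1ULL << j);
				return (int)first;
			}
		} else {
			run = 0;
		}
	}
	return -1;
}

int main(void)
{
	/* free entries at bits 3..4 and 8..12 */
	unsigned long long bm = (3ULL << 3) | (31ULL << 8);

	printf("run of 4 starts at %d\n", find_free_run(&bm, 4)); /* 8 */
	printf("run of 4 again: %d\n", find_free_run(&bm, 4));    /* -1 */
	return 0;
}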
+ */ + while (from < to) { + *last = from; + last = __gnttab_entry(from); + gnttab_last_free = from; + from++; + } + if (to < gnttab_size) + gnttab_free_tail_ptr = __gnttab_entry(to - 1); + } + + *last = GNTTAB_LIST_END; + if (gnttab_last_free != gnttab_size - 1) + gnttab_free_tail_ptr = NULL; + + return ret; +} + +static int get_free_entries_seq(unsigned int count) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + if (gnttab_free_count < count) { + ret = gnttab_expand(count - gnttab_free_count); + if (ret < 0) + goto out; + } + + if (get_seq_entry_count() < count) { + ret = get_free_seq(count); + if (ret >= 0) + goto out; + ret = gnttab_expand(count - get_seq_entry_count()); + if (ret < 0) + goto out; + } + + ret = *gnttab_free_tail_ptr; + *gnttab_free_tail_ptr = gnttab_entry(ret + count - 1); + gnttab_free_count -= count; + if (!gnttab_free_count) + gnttab_free_tail_ptr = NULL; + bitmap_clear(gnttab_free_bitmap, ret, count); + + out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); + + return ret; +} + static void do_free_callbacks(void) { struct gnttab_free_callback *callback, *next; @@ -219,25 +328,56 @@ static inline void check_free_callbacks(void) do_free_callbacks(); } -static void put_free_entry(grant_ref_t ref) +static void put_free_entry_locked(grant_ref_t ref) { - unsigned long flags; - spin_lock_irqsave(&gnttab_list_lock, flags); + if (unlikely(ref < GNTTAB_NR_RESERVED_ENTRIES)) + return; + gnttab_entry(ref) = gnttab_free_head; gnttab_free_head = ref; + if (!gnttab_free_count) + gnttab_last_free = ref; + if (gnttab_free_tail_ptr == &gnttab_free_head) + gnttab_free_tail_ptr = __gnttab_entry(ref); gnttab_free_count++; + bitmap_set(gnttab_free_bitmap, ref, 1); +} + +static void put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + put_free_entry_locked(ref); check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } +static void gnttab_set_free(unsigned int start, unsigned int n) +{ + unsigned int i; + + for (i = start; i < start + n - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(i) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_free_head = start; + gnttab_free_tail_ptr = &gnttab_free_head; + } else { + gnttab_entry(gnttab_last_free) = start; + } + gnttab_free_count += n; + gnttab_last_free = i; + + bitmap_set(gnttab_free_bitmap, start, n); +} + /* * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. * Introducing a valid entry into the grant table: * 1. Write ent->domid. - * 2. Write ent->frame: - * GTF_permit_access: Frame to which access is permitted. - * GTF_accept_transfer: Pseudo-phys frame slot being filled by new - * frame, or zero if none. + * 2. Write ent->frame: Frame to which access is permitted. * 3. Write memory barrier (WMB). * 4. Write ent->flags, inc. valid type. 
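The ordering rules in the comment above can be modelled with C11 atomics: publish domid and frame before the flags word with release semantics (the write barrier of step 3), and revoke with a compare-and-swap that fails while GTF_reading/GTF_writing are set, mirroring gnttab_end_foreign_access_ref_v1(). A standalone sketch; the flag values follow the Xen ABI but the entry layout here is illustrative only:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define GTF_permit_access 1
#define GTF_reading (1 << 3)
#define GTF_writing (1 << 4)

struct grant_v1 {
	uint16_t domid;
	uint32_t frame;
	_Atomic uint16_t flags;
};

static void update_entry(struct grant_v1 *g, uint16_t domid,
			 uint32_t frame, uint16_t flags)
{
	g->domid = domid;
	g->frame = frame;
	/* Steps 1-2 must be visible before step 4 (the flags write). */
	atomic_store_explicit(&g->flags, flags, memory_order_release);
}

static bool end_access(struct grant_v1 *g)
{
	uint16_t flags = atomic_load_explicit(&g->flags,
					      memory_order_relaxed);

	do {
		if (flags & (GTF_reading | GTF_writing))
			return false;	/* still mapped by the peer: defer */
	} while (!atomic_compare_exchange_weak(&g->flags, &flags, 0));

	return true;
}

int main(void)
{
	struct grant_v1 g = { 0 };

	update_entry(&g, 1, 0xabcd, GTF_permit_access);
	return end_access(&g) ? 0 : 1;
}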
*/ @@ -285,39 +425,21 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - -static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref) { - u16 flags, nflags; - u16 *pflags; + u16 *pflags = &gnttab_shared.v1[ref].flags; + u16 flags; - pflags = &gnttab_shared.v1[ref].flags; - nflags = *pflags; + flags = *pflags; do { - flags = nflags; if (flags & (GTF_reading|GTF_writing)) return 0; - } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); + } while (!sync_try_cmpxchg(pflags, &flags, 0)); return 1; } -static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref) { gnttab_shared.v2[ref].hdr.flags = 0; mb(); /* Concurrent access by hypervisor. */ @@ -340,24 +462,33 @@ static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) return 1; } -static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref) { - return gnttab_interface->end_foreign_access_ref(ref, readonly); + return gnttab_interface->end_foreign_access_ref(ref); } -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +int gnttab_end_foreign_access_ref(grant_ref_t ref) { - if (_gnttab_end_foreign_access_ref(ref, readonly)) + if (_gnttab_end_foreign_access_ref(ref)) return 1; pr_warn("WARNING: g.e. 
%#x still in use!\n", ref); return 0; } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; - bool ro; uint16_t warn_delay; struct page *page; }; @@ -365,14 +496,21 @@ static LIST_HEAD(deferred_list); static void gnttab_handle_deferred(struct timer_list *); static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); +static atomic64_t deferred_count; +static atomic64_t leaked_count; +static unsigned int free_per_iteration = 10; +module_param(free_per_iteration, uint, 0600); + static void gnttab_handle_deferred(struct timer_list *unused) { - unsigned int nr = 10; + unsigned int nr = READ_ONCE(free_per_iteration); + const bool ignore_limit = nr == 0; struct deferred_entry *first = NULL; unsigned long flags; + size_t freed = 0; spin_lock_irqsave(&gnttab_list_lock, flags); - while (nr--) { + while ((ignore_limit || nr--) && !list_empty(&deferred_list)) { struct deferred_entry *entry = list_first_entry(&deferred_list, struct deferred_entry, list); @@ -381,14 +519,15 @@ static void gnttab_handle_deferred(struct timer_list *unused) break; list_del(&entry->list); spin_unlock_irqrestore(&gnttab_list_lock, flags); - if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + if (_gnttab_end_foreign_access_ref(entry->ref)) { + uint64_t ret = atomic64_dec_return(&deferred_count); + put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n", + entry->ref, page_to_pfn(entry->page), + (unsigned long long)ret); + put_page(entry->page); + freed++; kfree(entry); entry = NULL; } else { @@ -400,27 +539,35 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_lock_irqsave(&gnttab_list_lock, flags); if (entry) list_add_tail(&entry->list, &deferred_list); - else if (list_empty(&deferred_list)) - break; } - if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + if (list_empty(&deferred_list)) + WARN_ON(atomic64_read(&deferred_count)); + else if (!timer_pending(&deferred_timer)) { deferred_timer.expires = jiffies + HZ; add_timer(&deferred_timer); } spin_unlock_irqrestore(&gnttab_list_lock, flags); + pr_debug("Freed %zu references", freed); } -static void gnttab_add_deferred(grant_ref_t ref, bool readonly, - struct page *page) +static void gnttab_add_deferred(grant_ref_t ref, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - const char *what = KERN_WARNING "leaking"; + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? 
GFP_ATOMIC : GFP_KERNEL; + uint64_t leaked, deferred; + + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } if (entry) { unsigned long flags; entry->ref = ref; - entry->ro = readonly; entry->page = page; entry->warn_delay = 60; spin_lock_irqsave(&gnttab_list_lock, flags); @@ -430,120 +577,38 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, add_timer(&deferred_timer); } spin_unlock_irqrestore(&gnttab_list_lock, flags); - what = KERN_DEBUG "deferring"; - } - printk("%s g.e. %#x (pfn %#lx)\n", - what, ref, page ? page_to_pfn(page) : -1); -} - -void gnttab_end_foreign_access(grant_ref_t ref, int readonly, - unsigned long page) -{ - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); - if (page != 0) - put_page(virt_to_page(page)); - } else - gnttab_add_deferred(ref, readonly, - page ? virt_to_page(page) : NULL); -} -EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); - -int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) -{ - int ref; - - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - gnttab_grant_foreign_transfer_ref(ref, domid, pfn); - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); - -void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, - unsigned long pfn) -{ - gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer); -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); - -static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) -{ - unsigned long frame; - u16 flags; - u16 *pflags; - - pflags = &gnttab_shared.v1[ref].flags; - - /* - * If a transfer is not even yet started, try to reclaim the grant - * reference and return failure (== 0). - */ - while (!((flags = *pflags) & GTF_transfer_committed)) { - if (sync_cmpxchg(pflags, flags, 0) == flags) - return 0; - cpu_relax(); - } - - /* If a transfer is in progress then wait until it is completed. */ - while (!(flags & GTF_transfer_completed)) { - flags = *pflags; - cpu_relax(); + deferred = atomic64_inc_return(&deferred_count); + leaked = atomic64_read(&leaked_count); + pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", + ref, page ? page_to_pfn(page) : -1, deferred, leaked); + } else { + deferred = atomic64_read(&deferred_count); + leaked = atomic64_inc_return(&leaked_count); + pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", + ref, page ? page_to_pfn(page) : -1, deferred, leaked); } - - rmb(); /* Read the frame number /after/ reading completion status. */ - frame = gnttab_shared.v1[ref].frame; - BUG_ON(frame == 0); - - return frame; } -static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +int gnttab_try_end_foreign_access(grant_ref_t ref) { - unsigned long frame; - u16 flags; - u16 *pflags; - - pflags = &gnttab_shared.v2[ref].hdr.flags; + int ret = _gnttab_end_foreign_access_ref(ref); - /* - * If a transfer is not even yet started, try to reclaim the grant - * reference and return failure (== 0). - */ - while (!((flags = *pflags) & GTF_transfer_committed)) { - if (sync_cmpxchg(pflags, flags, 0) == flags) - return 0; - cpu_relax(); - } - - /* If a transfer is in progress then wait until it is completed. */ - while (!(flags & GTF_transfer_completed)) { - flags = *pflags; - cpu_relax(); - } - - rmb(); /* Read the frame number /after/ reading completion status. 
*/ - frame = gnttab_shared.v2[ref].full_page.frame; - BUG_ON(frame == 0); - - return frame; -} + if (ret) + put_free_entry(ref); -unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) -{ - return gnttab_interface->end_foreign_transfer_ref(ref); + return ret; } -EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); -unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) +void gnttab_end_foreign_access(grant_ref_t ref, struct page *page) { - unsigned long frame = gnttab_end_foreign_transfer_ref(ref); - put_free_entry(ref); - return frame; + if (gnttab_try_end_foreign_access(ref)) { + if (page) + put_page(page); + } else + gnttab_add_deferred(ref, page); } -EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); void gnttab_free_grant_reference(grant_ref_t ref) { @@ -555,23 +620,31 @@ void gnttab_free_grant_references(grant_ref_t head) { grant_ref_t ref; unsigned long flags; - int count = 1; - if (head == GNTTAB_LIST_END) - return; + spin_lock_irqsave(&gnttab_list_lock, flags); - ref = head; - while (gnttab_entry(ref) != GNTTAB_LIST_END) { - ref = gnttab_entry(ref); - count++; + while (head != GNTTAB_LIST_END) { + ref = gnttab_entry(head); + put_free_entry_locked(head); + head = ref; } - gnttab_entry(ref) = gnttab_free_head; - gnttab_free_head = head; - gnttab_free_count += count; check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } EXPORT_SYMBOL_GPL(gnttab_free_grant_references); +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count) +{ + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (i = count; i > 0; i--) + put_free_entry_locked(head + i - 1); + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference_seq); + int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) { int h = get_free_entries(count); @@ -585,6 +658,24 @@ int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) } EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first) +{ + int h; + + if (count == 1) + h = get_free_entries(1); + else + h = get_free_entries_seq(count); + + if (h < 0) + return -ENOSPC; + + *first = h; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_reference_seq); + int gnttab_empty_grant_references(const grant_ref_t *private_head) { return (*private_head == GNTTAB_LIST_END); @@ -664,7 +755,6 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int nr_glist_frames, new_nr_glist_frames; unsigned int grefs_per_frame; - BUG_ON(gnttab_interface == NULL); grefs_per_frame = gnttab_interface->grefs_per_grant_frame; new_nr_grant_frames = nr_grant_frames + more_frames; @@ -678,16 +768,13 @@ static int grow_gnttab_list(unsigned int more_frames) goto grow_nomem; } + gnttab_set_free(gnttab_size, extra_entries); - for (i = grefs_per_frame * nr_grant_frames; - i < grefs_per_frame * new_nr_grant_frames - 1; i++) - gnttab_entry(i) = i + 1; - - gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = grefs_per_frame * nr_grant_frames; - gnttab_free_count += extra_entries; + if (!gnttab_free_tail_ptr) + gnttab_free_tail_ptr = __gnttab_entry(gnttab_size); nr_grant_frames = new_nr_grant_frames; + gnttab_size += extra_entries; check_free_callbacks(); @@ -738,7 +825,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) if (xen_auto_xlat_grant_frames.count) return 
-EINVAL; - vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes); + vaddr = memremap(addr, XEN_PAGE_SIZE * max_nr_gframes, MEMREMAP_WB); if (vaddr == NULL) { pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", &addr); @@ -746,7 +833,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) } pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); if (!pfn) { - xen_unmap(vaddr); + memunmap(vaddr); return -ENOMEM; } for (i = 0; i < max_nr_gframes; i++) @@ -765,7 +852,7 @@ void gnttab_free_auto_xlat_frames(void) if (!xen_auto_xlat_grant_frames.count) return; kfree(xen_auto_xlat_grant_frames.pfn); - xen_unmap(xen_auto_xlat_grant_frames.vaddr); + memunmap(xen_auto_xlat_grant_frames.vaddr); xen_auto_xlat_grant_frames.pfn = NULL; xen_auto_xlat_grant_frames.count = 0; @@ -803,7 +890,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) { int ret; - ret = alloc_xenballooned_pages(nr_pages, pages); + ret = xen_alloc_unpopulated_pages(nr_pages, pages); if (ret < 0) return ret; @@ -815,6 +902,129 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) } EXPORT_SYMBOL_GPL(gnttab_alloc_pages); +#ifdef CONFIG_XEN_UNPOPULATED_ALLOC +static inline void cache_init(struct gnttab_page_cache *cache) +{ + cache->pages = NULL; +} + +static inline bool cache_empty(struct gnttab_page_cache *cache) +{ + return !cache->pages; +} + +static inline struct page *cache_deq(struct gnttab_page_cache *cache) +{ + struct page *page; + + page = cache->pages; + cache->pages = page->zone_device_data; + + return page; +} + +static inline void cache_enq(struct gnttab_page_cache *cache, struct page *page) +{ + page->zone_device_data = cache->pages; + cache->pages = page; +} +#else +static inline void cache_init(struct gnttab_page_cache *cache) +{ + INIT_LIST_HEAD(&cache->pages); +} + +static inline bool cache_empty(struct gnttab_page_cache *cache) +{ + return list_empty(&cache->pages); +} + +static inline struct page *cache_deq(struct gnttab_page_cache *cache) +{ + struct page *page; + + page = list_first_entry(&cache->pages, struct page, lru); + list_del(&page->lru); + + return page; +} + +static inline void cache_enq(struct gnttab_page_cache *cache, struct page *page) +{ + list_add(&page->lru, &cache->pages); +} +#endif + +void gnttab_page_cache_init(struct gnttab_page_cache *cache) +{ + spin_lock_init(&cache->lock); + cache_init(cache); + cache->num_pages = 0; +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_init); + +int gnttab_page_cache_get(struct gnttab_page_cache *cache, struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + + if (cache_empty(cache)) { + spin_unlock_irqrestore(&cache->lock, flags); + return gnttab_alloc_pages(1, page); + } + + page[0] = cache_deq(cache); + cache->num_pages--; + + spin_unlock_irqrestore(&cache->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_get); + +void gnttab_page_cache_put(struct gnttab_page_cache *cache, struct page **page, + unsigned int num) +{ + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&cache->lock, flags); + + for (i = 0; i < num; i++) + cache_enq(cache, page[i]); + cache->num_pages += num; + + spin_unlock_irqrestore(&cache->lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_put); + +void gnttab_page_cache_shrink(struct gnttab_page_cache *cache, unsigned int num) +{ + struct page *page[10]; + unsigned int i = 0; + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + + while (cache->num_pages > num) { + page[i] = cache_deq(cache); + cache->num_pages--; + if (++i == 
ARRAY_SIZE(page)) { + spin_unlock_irqrestore(&cache->lock, flags); + gnttab_free_pages(i, page); + i = 0; + spin_lock_irqsave(&cache->lock, flags); + } + } + + spin_unlock_irqrestore(&cache->lock, flags); + + if (i != 0) + gnttab_free_pages(i, page); +} +EXPORT_SYMBOL_GPL(gnttab_page_cache_shrink); + void gnttab_pages_clear_private(int nr_pages, struct page **pages) { int i; @@ -832,13 +1042,13 @@ EXPORT_SYMBOL_GPL(gnttab_pages_clear_private); /** * gnttab_free_pages - free pages allocated by gnttab_alloc_pages() - * @nr_pages; number of pages to free + * @nr_pages: number of pages to free * @pages: the pages */ void gnttab_free_pages(int nr_pages, struct page **pages) { gnttab_pages_clear_private(nr_pages, pages); - free_xenballooned_pages(nr_pages, pages); + xen_free_unpopulated_pages(nr_pages, pages); } EXPORT_SYMBOL_GPL(gnttab_free_pages); @@ -853,6 +1063,9 @@ int gnttab_dma_alloc_pages(struct gnttab_dma_alloc_args *args) size_t size; int i, ret; + if (args->nr_pages < 0 || args->nr_pages > (INT_MAX >> PAGE_SHIFT)) + return -ENOMEM; + size = args->nr_pages << PAGE_SHIFT; if (args->coherent) args->vaddr = dma_alloc_coherent(args->dev, size, @@ -991,7 +1204,7 @@ void gnttab_foreach_grant_in_range(struct page *page, unsigned int glen; unsigned long xen_pfn; - len = min_t(unsigned int, PAGE_SIZE - offset, len); + len = min(PAGE_SIZE - offset, len); goffset = xen_offset_in_page(offset); xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset); @@ -1160,7 +1373,6 @@ EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); static unsigned int nr_status_frames(unsigned int nr_grant_frames) { - BUG_ON(gnttab_interface == NULL); return gnttab_frames(nr_grant_frames, SPP); } @@ -1237,7 +1449,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; @@ -1296,8 +1508,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .unmap_frames = gnttab_unmap_frames_v1, .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, - .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1308,8 +1519,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .unmap_frames = gnttab_unmap_frames_v2, .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, - .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) @@ -1360,11 +1570,10 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + if (!xen_pv_domain() && gnttab_shared.addr == NULL) { gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", - (unsigned long)xen_auto_xlat_grant_frames.vaddr); + pr_warn("gnttab share frames is not mapped!\n"); return -ENOMEM; } } @@ -1379,7 +1588,7 @@ int gnttab_resume(void) int gnttab_suspend(void) { - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) gnttab_interface->unmap_frames(); return 0; } @@ -1389,7 +1598,6 @@ static int 
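gnttab_page_cache_shrink() above shows a pattern worth noting: drain a locked LIFO in small fixed-size batches, dropping the lock while each batch is freed so other CPUs are not stalled behind the frees. A userspace model of the same loop (pthread mutex in place of the spinlock, free() in place of gnttab_free_pages()):

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static struct node *cache;
static unsigned int num;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Shrink the cache down to `keep` entries. */
static void cache_shrink(unsigned int keep)
{
	struct node *batch[10];
	unsigned int i = 0;

	pthread_mutex_lock(&lock);
	while (num > keep) {
		batch[i] = cache;
		cache = cache->next;
		num--;
		if (++i == sizeof(batch) / sizeof(batch[0])) {
			pthread_mutex_unlock(&lock);
			while (i)
				free(batch[--i]);	/* free outside the lock */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);

	while (i)
		free(batch[--i]);
}

int main(void)
{
	for (int i = 0; i < 25; i++) {
		struct node *n = malloc(sizeof(*n));

		n->next = cache;
		cache = n;
		num++;
	}
	cache_shrink(3);	/* keep at most 3 cached entries */
	return 0;
}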
gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; - BUG_ON(gnttab_interface == NULL); cur = nr_grant_frames; extra = ((req_entries + gnttab_interface->grefs_per_grant_frame - 1) / gnttab_interface->grefs_per_grant_frame); @@ -1412,21 +1620,20 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; - unsigned long max_nr_grant_frames; + unsigned long max_nr_grant_frames, max_nr_grefs; unsigned int max_nr_glist_frames, nr_glist_frames; - unsigned int nr_init_grefs; int ret; gnttab_request_version(); max_nr_grant_frames = gnttab_max_grant_frames(); + max_nr_grefs = max_nr_grant_frames * + gnttab_interface->grefs_per_grant_frame; nr_grant_frames = 1; /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ - BUG_ON(gnttab_interface == NULL); - max_nr_glist_frames = (max_nr_grant_frames * - gnttab_interface->grefs_per_grant_frame / RPP); + max_nr_glist_frames = max_nr_grefs / RPP; gnttab_list = kmalloc_array(max_nr_glist_frames, sizeof(grant_ref_t *), @@ -1443,6 +1650,12 @@ int gnttab_init(void) } } + gnttab_free_bitmap = bitmap_zalloc(max_nr_grefs, GFP_KERNEL); + if (!gnttab_free_bitmap) { + ret = -ENOMEM; + goto ini_nomem; + } + ret = arch_gnttab_init(max_nr_grant_frames, nr_status_frames(max_nr_grant_frames)); if (ret < 0) @@ -1453,15 +1666,10 @@ int gnttab_init(void) goto ini_nomem; } - nr_init_grefs = nr_grant_frames * - gnttab_interface->grefs_per_grant_frame; - - for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) - gnttab_entry(i) = i + 1; + gnttab_size = nr_grant_frames * gnttab_interface->grefs_per_grant_frame; - gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; - gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; - gnttab_free_head = NR_RESERVED_ENTRIES; + gnttab_set_free(GNTTAB_NR_RESERVED_ENTRIES, + gnttab_size - GNTTAB_NR_RESERVED_ENTRIES); printk("Grant table initialized\n"); return 0; @@ -1470,6 +1678,7 @@ int gnttab_init(void) for (i--; i >= 0; i--) free_page((unsigned long)gnttab_list[i]); kfree(gnttab_list); + bitmap_free(gnttab_free_bitmap); return ret; } EXPORT_SYMBOL_GPL(gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 5bb01a62f214..e20c40a62e64 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Handle extern requests for shutdown, reboot and sysrq */ @@ -10,6 +11,7 @@ #include <linux/reboot.h> #include <linux/sysrq.h> #include <linux/stop_machine.h> +#include <linux/suspend.h> #include <linux/freezer.h> #include <linux/syscore_ops.h> #include <linux/export.h> @@ -51,12 +53,6 @@ void xen_resume_notifier_register(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -void xen_resume_notifier_unregister(struct notifier_block *nb) -{ - raw_notifier_chain_unregister(&xen_resume_notifier, nb); -} -EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); - #ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) { @@ -100,10 +96,16 @@ static void do_suspend(void) shutting_down = SHUTDOWN_SUSPEND; + if (!mutex_trylock(&system_transition_mutex)) + { + pr_err("%s: failed to take system_transition_mutex\n", __func__); + goto out; + } + err = freeze_processes(); if (err) { pr_err("%s: freeze processes failed %d\n", __func__, err); - goto out; + goto out_unlock; } err = freeze_kernel_threads(); @@ -115,7 +117,7 @@ static void do_suspend(void) err = dpm_suspend_start(PMSG_FREEZE); if (err) { pr_err("%s: dpm_suspend_start %d\n", 
__func__, err); - goto out_thaw; + goto out_resume_end; } printk(KERN_DEBUG "suspending xenstore...\n"); @@ -140,6 +142,8 @@ static void do_suspend(void) raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + xen_arch_resume(); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { @@ -147,18 +151,19 @@ static void do_suspend(void) si.cancelled = 1; } - xen_arch_resume(); - out_resume: if (!si.cancelled) xs_resume(); else xs_suspend_cancel(); +out_resume_end: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); out_thaw: thaw_processes(); +out_unlock: + mutex_unlock(&system_transition_mutex); out: shutting_down = SHUTDOWN_INVALID; } @@ -178,6 +183,7 @@ static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unus case SYS_HALT: case SYS_POWER_OFF: shutting_down = SHUTDOWN_POWEROFF; + break; default: break; } @@ -203,10 +209,10 @@ static void do_poweroff(void) static void do_reboot(void) { shutting_down = SHUTDOWN_POWEROFF; /* ? */ - ctrl_alt_del(); + orderly_reboot(); } -static struct shutdown_handler shutdown_handlers[] = { +static const struct shutdown_handler shutdown_handlers[] = { { "poweroff", true, do_poweroff }, { "halt", false, do_poweroff }, { "reboot", true, do_reboot }, diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index b8bf61abb65b..4f65b641c054 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -182,7 +182,6 @@ static const struct file_operations xen_mce_chrdev_ops = { .read = xen_mce_chrdev_read, .poll = xen_mce_chrdev_poll, .unlocked_ioctl = xen_mce_chrdev_ioctl, - .llseek = no_llseek, }; static struct miscdevice xen_mce_chrdev_device = { @@ -222,7 +221,7 @@ static int convert_log(struct mc_info *mi) struct mcinfo_global *mc_global; struct mcinfo_bank *mc_bank; struct xen_mce m; - uint32_t i; + unsigned int i, j; mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); @@ -248,7 +247,17 @@ static int convert_log(struct mc_info *mi) m.socketid = g_physinfo[i].mc_chipid; m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; - m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + for (j = 0; j < g_physinfo[i].mc_nmsrvals; ++j) + switch (g_physinfo[i].mc_msrvalues[j].reg) { + case MSR_IA32_MCG_CAP: + m.mcgcap = g_physinfo[i].mc_msrvalues[j].value; + break; + + case MSR_PPIN: + case MSR_AMD_PPIN: + m.ppin = g_physinfo[i].mc_msrvalues[j].value; + break; + } mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); diff --git a/drivers/xen/mem-reservation.c b/drivers/xen/mem-reservation.c index 3782cf070338..24648836e0d4 100644 --- a/drivers/xen/mem-reservation.c +++ b/drivers/xen/mem-reservation.c @@ -35,6 +35,7 @@ void __xenmem_reservation_va_mapping_update(unsigned long count, for (i = 0; i < count; i++) { struct page *page = pages[i]; unsigned long pfn = page_to_pfn(page); + int ret; BUG_ON(!page); @@ -46,16 +47,10 @@ void __xenmem_reservation_va_mapping_update(unsigned long count, set_phys_to_machine(pfn, frames[i]); - /* Link back into the page tables if not highmem. 
*/ - if (!PageHighMem(page)) { - int ret; - - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frames[i], PAGE_KERNEL), - 0); - BUG_ON(ret); - } + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frames[i], PAGE_KERNEL), 0); + BUG_ON(ret); } } EXPORT_SYMBOL_GPL(__xenmem_reservation_va_mapping_update); @@ -68,6 +63,7 @@ void __xenmem_reservation_va_mapping_reset(unsigned long count, for (i = 0; i < count; i++) { struct page *page = pages[i]; unsigned long pfn = page_to_pfn(page); + int ret; /* * We don't support PV MMU when Linux and Xen are using @@ -75,14 +71,11 @@ void __xenmem_reservation_va_mapping_reset(unsigned long count, */ BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); - if (!PageHighMem(page)) { - int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + __pte_ma(0), 0); + BUG_ON(ret); - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } } diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 7494dbeb4409..bfe07adb3e3a 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -1,25 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009, Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * Author: Weidong Han <weidong.han@intel.com> */ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/pci-acpi.h> +#include <xen/pci.h> #include <xen/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/xen.h> @@ -29,6 +18,8 @@ #include "../pci/pci.h" #ifdef CONFIG_PCI_MMCONFIG #include <asm/pci_x86.h> + +static int xen_mcfg_late(void); #endif static bool __read_mostly pci_seg_supported = true; @@ -40,17 +31,36 @@ static int xen_add_device(struct device *dev) #ifdef CONFIG_PCI_IOV struct pci_dev *physfn = pci_dev->physfn; #endif +#ifdef CONFIG_PCI_MMCONFIG + static bool pci_mcfg_reserved = false; + /* + * Reserve MCFG areas in Xen on first invocation due to this being + * potentially called from inside of acpi_init immediately after + * MCFG table has been finally parsed. + */ + if (!pci_mcfg_reserved) { + xen_mcfg_late(); + pci_mcfg_reserved = true; + } +#endif + + if (pci_domain_nr(pci_dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values, do not attempt to register devices with Xen in + * segments greater or equal than 0x10000. 
+ */ + dev_info(dev, + "not registering with Xen: invalid PCI segment\n"); + return 0; + } if (pci_seg_supported) { - struct { - struct physdev_pci_device_add add; - uint32_t pxm; - } add_ext = { - .add.seg = pci_domain_nr(pci_dev->bus), - .add.bus = pci_dev->bus->number, - .add.devfn = pci_dev->devfn - }; - struct physdev_pci_device_add *add = &add_ext.add; + DEFINE_RAW_FLEX(struct physdev_pci_device_add, add, optarr, 1); + + add->seg = pci_domain_nr(pci_dev->bus); + add->bus = pci_dev->bus->number; + add->devfn = pci_dev->devfn; #ifdef CONFIG_ACPI acpi_handle handle; @@ -151,6 +161,16 @@ static int xen_remove_device(struct device *dev) int r; struct pci_dev *pci_dev = to_pci_dev(dev); + if (pci_domain_nr(pci_dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values. + */ + dev_info(dev, + "not unregistering with Xen: invalid PCI segment\n"); + return 0; + } + if (pci_seg_supported) { struct physdev_pci_device device = { .seg = pci_domain_nr(pci_dev->bus), @@ -175,6 +195,29 @@ static int xen_remove_device(struct device *dev) return r; } +int xen_reset_device(const struct pci_dev *dev) +{ + struct pci_device_reset device = { + .dev.seg = pci_domain_nr(dev->bus), + .dev.bus = dev->bus->number, + .dev.devfn = dev->devfn, + .flags = PCI_DEVICE_RESET_FLR, + }; + + if (pci_domain_nr(dev->bus) >> 16) { + /* + * The hypercall interface is limited to 16bit PCI segment + * values. + */ + dev_info(&dev->dev, + "unable to notify Xen of device reset: invalid PCI segment\n"); + return 0; + } + + return HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_reset, &device); +} +EXPORT_SYMBOL_GPL(xen_reset_device); + static int xen_pci_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -213,7 +256,7 @@ static int __init register_xen_pci_notifier(void) arch_initcall(register_xen_pci_notifier); #ifdef CONFIG_PCI_MMCONFIG -static int __init xen_mcfg_late(void) +static int xen_mcfg_late(void) { struct pci_mmcfg_region *cfg; int rc; @@ -252,8 +295,79 @@ static int __init xen_mcfg_late(void) } return 0; } -/* - * Needs to be done after acpi_init which are subsys_initcall. 
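That ordering note becomes moot because the call is now made lazily from xen_add_device(), guarded by the pci_mcfg_reserved flag added earlier in this patch. The run-once shape of that guard, sketched with hypothetical names (safe only because the sole caller runs sequentially from the PCI notifier; concurrent callers would want a mutex or DO_ONCE() instead):

static bool deferred_setup_done;

static void ensure_deferred_setup(void)
{
        if (!deferred_setup_done) {
                do_deferred_setup();        /* hypothetical setup hook */
                deferred_setup_done = true;
        }
}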
- */ -subsys_initcall_sync(xen_mcfg_late); +#endif + +#ifdef CONFIG_XEN_DOM0 +struct xen_device_domain_owner { + domid_t domain; + struct pci_dev *dev; + struct list_head list; +}; + +static DEFINE_SPINLOCK(dev_domain_list_spinlock); +static LIST_HEAD(dev_domain_list); + +static struct xen_device_domain_owner *find_device(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + list_for_each_entry(owner, &dev_domain_list, list) { + if (owner->dev == dev) + return owner; + } + return NULL; +} + +int xen_find_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + int domain = -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (owner) + domain = owner->domain; + spin_unlock(&dev_domain_list_spinlock); + return domain; +} +EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); + +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) +{ + struct xen_device_domain_owner *owner; + + owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); + if (!owner) + return -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + if (find_device(dev)) { + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return -EEXIST; + } + owner->domain = domain; + owner->dev = dev; + list_add_tail(&owner->list, &dev_domain_list); + spin_unlock(&dev_domain_list_spinlock); + return 0; +} +EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); + +int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (!owner) { + spin_unlock(&dev_domain_list_spinlock); + return -ENODEV; + } + list_del(&owner->list); + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); #endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index cdc6daa7a9f6..093ad4a08672 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -47,6 +47,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#ifdef CONFIG_ACPI +#include <acpi/processor.h> +#endif /* * @cpu_id: Xen physical cpu logic number @@ -58,10 +61,11 @@ struct pcpu { struct list_head list; struct device dev; uint32_t cpu_id; + uint32_t acpi_id; uint32_t flags; }; -static struct bus_type xen_pcpu_subsys = { +static const struct bus_type xen_pcpu_subsys = { .name = "xen_cpu", .dev_name = "xen_cpu", }; @@ -92,7 +96,7 @@ static int xen_pcpu_up(uint32_t cpu_id) return HYPERVISOR_platform_op(&op); } -static ssize_t show_online(struct device *dev, +static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -101,7 +105,7 @@ static ssize_t show_online(struct device *dev, return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); } -static ssize_t __ref store_online(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -130,7 +134,7 @@ static ssize_t __ref store_online(struct device *dev, ret = count; return ret; } -static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); +static DEVICE_ATTR_RW(online); static struct attribute *pcpu_dev_attrs[] = { &dev_attr_online.attr, @@ -228,7 +232,7 @@ static int register_pcpu(struct pcpu *pcpu) err = device_register(dev); if (err) { - pcpu_release(dev); + put_device(dev); return err; } @@ -249,6 +253,7 @@ static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) INIT_LIST_HEAD(&pcpu->list); 
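A note on the register_pcpu() change above: once device_register() has been called, the structure is refcounted, so an error may only be unwound with put_device(); invoking the release function directly, as the old code did, runs it while the kobject still holds a reference. The contract in outline:

/* Sketch of the device_register() error contract: on failure the
 * embedded kobject still owns the final reference, so drop it and
 * let the ->release() callback (pcpu_release here) free the memory. */
err = device_register(dev);
if (err) {
        put_device(dev);        /* ends up calling pcpu_release() */
        return err;
}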
pcpu->cpu_id = info->xen_cpuid; + pcpu->acpi_id = info->acpi_id; pcpu->flags = info->flags; /* Need hold on xen_pcpu_lock before pcpu list manipulations */ @@ -345,41 +350,6 @@ static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Sync with Xen hypervisor after cpu hotadded */ -void xen_pcpu_hotplug_sync(void) -{ - schedule_work(&xen_pcpu_work); -} -EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); - -/* - * For hypervisor presented cpu, return logic cpu id; - * For hypervisor non-presented cpu, return -ENODEV. - */ -int xen_pcpu_id(uint32_t acpi_id) -{ - int cpu_id = 0, max_id = 0; - struct xen_platform_op op; - - op.cmd = XENPF_get_cpuinfo; - while (cpu_id <= max_id) { - op.u.pcpu_info.xen_cpuid = cpu_id; - if (HYPERVISOR_platform_op(&op)) { - cpu_id++; - continue; - } - - if (acpi_id == op.u.pcpu_info.acpi_id) - return cpu_id; - if (op.u.pcpu_info.max_present > max_id) - max_id = op.u.pcpu_info.max_present; - cpu_id++; - } - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(xen_pcpu_id); - static int __init xen_pcpu_init(void) { int irq, ret; @@ -416,3 +386,40 @@ err1: return ret; } arch_initcall(xen_pcpu_init); + +#ifdef CONFIG_ACPI +bool __init xen_processor_present(uint32_t acpi_id) +{ + const struct pcpu *pcpu; + bool online = false; + + mutex_lock(&xen_pcpu_lock); + list_for_each_entry(pcpu, &xen_pcpus, list) + if (pcpu->acpi_id == acpi_id) { + online = pcpu->flags & XEN_PCPU_FLAGS_ONLINE; + break; + } + mutex_unlock(&xen_pcpu_lock); + + return online; +} + +void xen_sanitize_proc_cap_bits(uint32_t *cap) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap }; + int ret; + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + ret = HYPERVISOR_platform_op(&op); + if (ret) + pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n", + ret); + else + *cap = buf[2]; +} +#endif diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 5d7dcad0b0a0..1db82da56db6 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * platform-pci.c * @@ -8,20 +9,6 @@ * Copyright (c) 2005, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * */ @@ -39,6 +26,8 @@ #define DRV_NAME "xen-platform-pci" +#define PCI_DEVICE_ID_XEN_PLATFORM_XS61 0x0002 + static unsigned long platform_mmio; static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; @@ -67,7 +56,8 @@ static uint64_t get_callback_via(struct pci_dev *pdev) pin = pdev->pin; /* We don't know the GSI. Specify the PCI INTx line instead. 
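The 64-bit "via" value assembled just after this comment packs the callback type plus the PCI address into one scalar. A standalone sketch of the layout (the shift of 56 for the type field is an assumption about HVM_CALLBACK_VIA_TYPE_SHIFT, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

/* Sketch: PCI-INTx type in the top bits, then segment, bus, devfn,
 * and the INTx line (pins are 1-based, the encoded line is 0-based). */
static uint64_t pack_callback_via(uint16_t seg, uint8_t bus,
                                  uint8_t devfn, uint8_t pin)
{
        return ((uint64_t)1 << 56) |            /* assumed type shift */
               ((uint64_t)seg << 32) |
               ((uint64_t)bus << 16) |
               ((uint64_t)devfn << 8) |
               (uint64_t)(pin - 1);
}

int main(void)
{
        /* device 0000:00:03.0 using INTA */
        printf("%#llx\n",
               (unsigned long long)pack_callback_via(0, 0, 0x18, 1));
        return 0;
}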
*/ - return ((uint64_t)0x01 << HVM_CALLBACK_VIA_TYPE_SHIFT) | /* PCI INTx identifier */ + return ((uint64_t)HVM_PARAM_CALLBACK_TYPE_PCI_INTX << + HVM_CALLBACK_VIA_TYPE_SHIFT) | ((uint64_t)pci_domain_nr(pdev->bus) << 32) | ((uint64_t)pdev->bus->number << 16) | ((uint64_t)(pdev->devfn & 0xff) << 8) | @@ -76,18 +66,17 @@ static uint64_t get_callback_via(struct pci_dev *pdev) static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) { - xen_hvm_evtchn_do_upcall(); - return IRQ_HANDLED; + return xen_evtchn_do_upcall(); } static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_SHARED, "xen-platform-pci", pdev); } -static int platform_pci_resume(struct pci_dev *pdev) +static int platform_pci_resume(struct device *dev) { int err; @@ -96,7 +85,7 @@ static int platform_pci_resume(struct pci_dev *pdev) err = xen_set_callback_via(callback_via); if (err) { - dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + dev_err(dev, "platform_pci_resume failure!\n"); return err; } return 0; @@ -145,12 +134,19 @@ static int platform_pci_probe(struct pci_dev *pdev, dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); goto out; } + /* + * It doesn't strictly *have* to run on CPU0 but it sure + * as hell better process the event channel ports delivered + * to CPU0. + */ + irq_set_affinity(pdev->irq, cpumask_of(0)); + callback_via = get_callback_via(pdev); ret = xen_set_callback_via(callback_via); if (ret) { dev_warn(&pdev->dev, "Unable to set the evtchn callback " "err=%d\n", ret); - goto out; + goto irq_out; } } @@ -158,14 +154,16 @@ static int platform_pci_probe(struct pci_dev *pdev, grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) - goto out; + goto irq_out; ret = gnttab_init(); if (ret) goto grant_out; - xenbus_probe(NULL); return 0; grant_out: gnttab_free_auto_xlat_frames(); +irq_out: + if (!xen_have_vector_callback) + free_irq(pdev->irq, pdev); out: pci_release_region(pdev, 0); mem_out: @@ -178,16 +176,22 @@ pci_out: static const struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM_XS61, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} }; +static const struct dev_pm_ops platform_pm_ops = { + .resume_noirq = platform_pci_resume, +}; + static struct pci_driver platform_driver = { .name = DRV_NAME, .probe = platform_pci_probe, .id_table = platform_pci_tbl, -#ifdef CONFIG_PM - .resume_early = platform_pci_resume, -#endif + .driver = { + .pm = &platform_pm_ops, + }, }; builtin_pci_driver(platform_driver); diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c deleted file mode 100644 index 08cb419eb4e6..000000000000 --- a/drivers/xen/preempt.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Preemptible hypercalls - * - * Copyright (C) 2014 Citrix Systems R&D ltd. - * - * This source code is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include <xen/xen-ops.h> - -#ifndef CONFIG_PREEMPT - -/* - * Some hypercalls issued by the toolstack can take many 10s of - * seconds. 
Allow tasks running hypercalls via the privcmd driver to - * be voluntarily preempted even if full kernel preemption is - * disabled. - * - * Such preemptible hypercalls are bracketed by - * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() - * calls. - */ - -DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); - -asmlinkage __visible void xen_maybe_preempt_hcall(void) -{ - if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) - && need_resched())) { - /* - * Clear flag as we may be rescheduled on a different - * cpu. - */ - __this_cpu_write(xen_in_preemptible_hcall, false); - _cond_resched(); - __this_cpu_write(xen_in_preemptible_hcall, true); - } -} -#endif /* CONFIG_PREEMPT */ diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index de01a6d0059d..0f0dad427d7e 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -19,6 +19,7 @@ #include "privcmd.h" +MODULE_DESCRIPTION("Xen Mmap of hypercall buffers"); MODULE_LICENSE("GPL"); struct privcmd_buf_private { @@ -140,8 +141,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma_priv = kzalloc(sizeof(*vma_priv) + count * sizeof(void *), - GFP_KERNEL); + vma_priv = kzalloc(struct_size(vma_priv, pages, count), GFP_KERNEL); if (!vma_priv) return -ENOMEM; @@ -157,7 +157,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) vma_priv->file_priv = file_priv; vma_priv->users = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); vma->vm_ops = &privcmd_buf_vm_ops; vma->vm_private_data = vma_priv; @@ -166,12 +166,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_priv->n_pages != count) ret = -ENOMEM; else - for (i = 0; i < vma_priv->n_pages; i++) { - ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, - vma_priv->pages[i]); - if (ret) - break; - } + ret = vm_map_pages_zero(vma, vma_priv->pages, + vma_priv->n_pages); if (ret) privcmd_buf_vmapriv_free(vma_priv); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index b24ddac1604b..f52a457b302d 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * privcmd.c * @@ -8,11 +9,17 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/eventfd.h> +#include <linux/file.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/mutex.h> +#include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/srcu.h> #include <linux/string.h> +#include <linux/workqueue.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/mman.h> @@ -23,25 +30,29 @@ #include <linux/seq_file.h> #include <linux/miscdevice.h> #include <linux/moduleparam.h> +#include <linux/virtio_mmio.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlb.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include <xen/xen.h> +#include <xen/events.h> #include <xen/privcmd.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> #include <xen/interface/hvm/dm_op.h> +#include <xen/interface/hvm/ioreq.h> #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/balloon.h> +#ifdef CONFIG_XEN_ACPI +#include <xen/acpi.h> +#endif #include "privcmd.h" +MODULE_DESCRIPTION("Xen hypercall passthrough driver"); 
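preempt.c is deleted here, but the bracketing contract it documented is still what privcmd relies on around long-running operations; the same pattern appears verbatim later in this patch (privcmd_ioctl_dm_op and irqfd_inject):

/* Kernel-context sketch: mark the task as being inside a preemptible
 * hypercall so it can be voluntarily rescheduled even without full
 * kernel preemption. */
xen_preemptible_hcall_begin();
rc = HYPERVISOR_dm_op(domid, 1, &xbufs);    /* may take tens of seconds */
xen_preemptible_hcall_end();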
MODULE_LICENSE("GPL"); #define PRIV_VMA_LOCKED ((void *)1) @@ -259,8 +270,8 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) LIST_HEAD(pagelist); struct mmap_gfn_state state; - /* We only support privcmd_ioctl_mmap_batch for auto translated. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ + if (!xen_pv_domain()) return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) @@ -277,14 +288,14 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) if (rc || list_empty(&pagelist)) goto out; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); { struct page *page = list_first_entry(&pagelist, struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) @@ -302,7 +313,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); out: free_page_list(&pagelist); @@ -342,7 +353,7 @@ static int mmap_batch_fn(void *data, int nr, void *state) struct page **cur_pages = NULL; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) cur_pages = &pages[st->index]; BUG_ON(nr < 0); @@ -422,15 +433,15 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) int rc; struct page **pages; - pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); if (pages == NULL) return -ENOMEM; - rc = alloc_xenballooned_pages(numpgs, pages); + rc = xen_alloc_unpopulated_pages(numpgs, pages); if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); - kfree(pages); + kvfree(pages); return -ENOMEM; } BUG_ON(vma->vm_private_data != NULL); @@ -498,7 +509,7 @@ static long privcmd_ioctl_mmap_batch( } } - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, m.addr); if (!vma || @@ -524,7 +535,7 @@ static long privcmd_ioctl_mmap_batch( ret = -EINVAL; goto out_unlock; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_pv_domain()) { ret = alloc_empty_pages(vma, nr_pages); if (ret < 0) goto out_unlock; @@ -554,7 +565,7 @@ static long privcmd_ioctl_mmap_batch( BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), &pagelist, mmap_batch_fn, &state)); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (state.global_error) { /* Write back errors in second pass. */ @@ -575,34 +586,38 @@ out: return ret; out_unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); goto out; } static int lock_pages( struct privcmd_dm_op_buf kbufs[], unsigned int num, - struct page *pages[], unsigned int nr_pages) + struct page *pages[], unsigned int nr_pages, unsigned int *pinned) { - unsigned int i; + unsigned int i, off = 0; - for (i = 0; i < num; i++) { + for (i = 0; i < num; ) { unsigned int requested; - int pinned; + int page_count; requested = DIV_ROUND_UP( offset_in_page(kbufs[i].uptr) + kbufs[i].size, - PAGE_SIZE); + PAGE_SIZE) - off; if (requested > nr_pages) return -ENOSPC; - pinned = get_user_pages_fast( - (unsigned long) kbufs[i].uptr, + page_count = pin_user_pages_fast( + (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, requested, FOLL_WRITE, pages); - if (pinned < 0) - return pinned; + if (page_count <= 0) + return page_count ? 
: -EFAULT; + + *pinned += page_count; + nr_pages -= page_count; + pages += page_count; - nr_pages -= pinned; - pages += pinned; + off = (requested == page_count) ? 0 : off + page_count; + i += !off; } return 0; @@ -610,15 +625,7 @@ static int lock_pages( static void unlock_pages(struct page *pages[], unsigned int nr_pages) { - unsigned int i; - - if (!pages) - return; - - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - put_page(pages[i]); - } + unpin_user_pages_dirty_lock(pages, nr_pages, true); } static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) @@ -631,6 +638,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) struct xen_dm_op_buf *xbufs = NULL; unsigned int i; long rc; + unsigned int pinned = 0; if (copy_from_user(&kdata, udata, sizeof(kdata))) return -EFAULT; @@ -684,8 +692,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) goto out; } - rc = lock_pages(kbufs, kdata.num, pages, nr_pages); - if (rc) + rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); + if (rc < 0) goto out; for (i = 0; i < kdata.num; i++) { @@ -698,7 +706,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) xen_preemptible_hcall_end(); out: - unlock_pages(pages, nr_pages); + unlock_pages(pages, pinned); kfree(xbufs); kfree(pages); kfree(kbufs); @@ -723,34 +731,15 @@ static long privcmd_ioctl_restrict(struct file *file, void __user *udata) return 0; } -struct remap_pfn { - struct mm_struct *mm; - struct page **pages; - pgprot_t prot; - unsigned long i; -}; - -static int remap_pfn_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) -{ - struct remap_pfn *r = data; - struct page *page = r->pages[r->i]; - pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot)); - - set_pte_at(r->mm, addr, ptep, pte); - r->i++; - - return 0; -} - -static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) +static long privcmd_ioctl_mmap_resource(struct file *file, + struct privcmd_mmap_resource __user *udata) { struct privcmd_data *data = file->private_data; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct privcmd_mmap_resource kdata; xen_pfn_t *pfns = NULL; - struct xen_mem_acquire_resource xdata; + struct xen_mem_acquire_resource xdata = { }; int rc; if (copy_from_user(&kdata, udata, sizeof(kdata))) @@ -760,7 +749,23 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) if (data->domid != DOMID_INVALID && data->domid != kdata.dom) return -EPERM; - down_write(&mm->mmap_sem); + /* Both fields must be set or unset */ + if (!!kdata.addr != !!kdata.num) + return -EINVAL; + + xdata.domid = kdata.dom; + xdata.type = kdata.type; + xdata.id = kdata.id; + + if (!kdata.addr && !kdata.num) { + /* Query the size of the resource. 
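The addr/num checks above give the ioctl a two-pass calling convention. A hypothetical userspace sketch (fd is assumed to be an open /dev/xen/privcmd; field and ioctl names come from this patch; error handling is elided):

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/privcmd.h>        /* assumed uapi header */

static void *map_resource(int fd, uint16_t domid, uint32_t type, uint32_t id)
{
        struct privcmd_mmap_resource res = {
                .dom = domid, .type = type, .id = id,   /* addr = num = 0 */
        };

        /* Pass 1: query only -- the kernel writes nr_frames to res.num. */
        if (ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &res))
                return NULL;

        /* Pass 2: map the resource into a fresh window of that size. */
        res.addr = (uintptr_t)mmap(NULL, res.num * 4096UL /* page size assumed */,
                                   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &res))
                return NULL;
        return (void *)(uintptr_t)res.addr;
}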
*/ + rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); + if (rc) + return rc; + return __put_user(xdata.nr_frames, &udata->num); + } + + mmap_write_lock(mm); vma = find_vma(mm, kdata.addr); if (!vma || vma->vm_ops != &privcmd_vm_ops) { @@ -768,13 +773,13 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) goto out; } - pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL); + pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN); if (!pfns) { rc = -ENOMEM; goto out; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); struct page **pages; unsigned int i; @@ -784,6 +789,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) goto out; pages = vma->vm_private_data; + for (i = 0; i < kdata.num; i++) { xen_pfn_t pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); @@ -793,10 +799,6 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) } else vma->vm_private_data = PRIV_VMA_LOCKED; - memset(&xdata, 0, sizeof(xdata)); - xdata.domid = kdata.dom; - xdata.type = kdata.type; - xdata.id = kdata.id; xdata.frame = kdata.idx; xdata.nr_frames = kdata.num; set_xen_guest_handle(xdata.frame_list, pfns); @@ -808,35 +810,27 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) if (rc) goto out; - if (xen_feature(XENFEAT_auto_translated_physmap)) { - struct remap_pfn r = { - .mm = vma->vm_mm, - .pages = vma->vm_private_data, - .prot = vma->vm_page_prot, - }; - - rc = apply_to_page_range(r.mm, kdata.addr, - kdata.num << PAGE_SHIFT, - remap_pfn_fn, &r); + if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) { + rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); } else { unsigned int domid = (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? 
DOMID_SELF : kdata.dom; - int num; + int num, *errs = (int *)pfns; + BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); num = xen_remap_domain_mfn_array(vma, kdata.addr & PAGE_MASK, - pfns, kdata.num, (int *)pfns, + pfns, kdata.num, errs, vma->vm_page_prot, - domid, - vma->vm_private_data); + domid); if (num < 0) rc = num; else if (num != kdata.num) { unsigned int i; for (i = 0; i < num; i++) { - rc = pfns[i]; + rc = errs[i]; if (rc < 0) break; } @@ -845,12 +839,673 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) } out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); kfree(pfns); return rc; } +static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata) +{ +#if defined(CONFIG_XEN_ACPI) + int rc; + struct privcmd_pcidev_get_gsi kdata; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + rc = xen_acpi_get_gsi_from_sbdf(kdata.sbdf); + if (rc < 0) + return rc; + + kdata.gsi = rc; + if (copy_to_user(udata, &kdata, sizeof(kdata))) + return -EFAULT; + + return 0; +#else + return -EINVAL; +#endif +} + +#ifdef CONFIG_XEN_PRIVCMD_EVENTFD +/* Irqfd support */ +static struct workqueue_struct *irqfd_cleanup_wq; +static DEFINE_SPINLOCK(irqfds_lock); +DEFINE_STATIC_SRCU(irqfds_srcu); +static LIST_HEAD(irqfds_list); + +struct privcmd_kernel_irqfd { + struct xen_dm_op_buf xbufs; + domid_t dom; + bool error; + struct eventfd_ctx *eventfd; + struct work_struct shutdown; + wait_queue_entry_t wait; + struct list_head list; + poll_table pt; +}; + +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) +{ + lockdep_assert_held(&irqfds_lock); + + list_del_init(&kirqfd->list); + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); +} + +static void irqfd_shutdown(struct work_struct *work) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(work, struct privcmd_kernel_irqfd, shutdown); + u64 cnt; + + /* Make sure irqfd has been initialized in assign path */ + synchronize_srcu(&irqfds_srcu); + + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); + eventfd_ctx_put(kirqfd->eventfd); + kfree(kirqfd); +} + +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) +{ + u64 cnt; + long rc; + + eventfd_ctx_do_read(kirqfd->eventfd, &cnt); + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); + xen_preemptible_hcall_end(); + + /* Don't repeat the error message for consecutive failures */ + if (rc && !kirqfd->error) { + pr_err("Failed to configure irq for guest domain: %d\n", + kirqfd->dom); + } + + kirqfd->error = rc; +} + +static int +irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(wait, struct privcmd_kernel_irqfd, wait); + __poll_t flags = key_to_poll(key); + + if (flags & EPOLLIN) + irqfd_inject(kirqfd); + + if (flags & EPOLLHUP) { + unsigned long flags; + + spin_lock_irqsave(&irqfds_lock, flags); + irqfd_deactivate(kirqfd); + spin_unlock_irqrestore(&irqfds_lock, flags); + } + + return 0; +} + +static void +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct privcmd_kernel_irqfd *kirqfd = + container_of(pt, struct privcmd_kernel_irqfd, pt); + + add_wait_queue_priority(wqh, &kirqfd->wait); +} + +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + unsigned long flags; + __poll_t events; + void *dm_op; + int ret, idx; + + CLASS(fd, f)(irqfd->fd); + + kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, 
GFP_KERNEL); + if (!kirqfd) + return -ENOMEM; + dm_op = kirqfd + 1; + + if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) { + ret = -EFAULT; + goto error_kfree; + } + + kirqfd->xbufs.size = irqfd->size; + set_xen_guest_handle(kirqfd->xbufs.h, dm_op); + kirqfd->dom = irqfd->dom; + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); + + if (fd_empty(f)) { + ret = -EBADF; + goto error_kfree; + } + + kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(kirqfd->eventfd)) { + ret = PTR_ERR(kirqfd->eventfd); + goto error_kfree; + } + + /* + * Install our own custom wake-up handling so we are notified via a + * callback whenever someone signals the underlying eventfd. + */ + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry(tmp, &irqfds_list, list) { + if (kirqfd->eventfd == tmp->eventfd) { + ret = -EBUSY; + spin_unlock_irqrestore(&irqfds_lock, flags); + goto error_eventfd; + } + } + + idx = srcu_read_lock(&irqfds_srcu); + list_add_tail(&kirqfd->list, &irqfds_list); + spin_unlock_irqrestore(&irqfds_lock, flags); + + /* + * Check if there was an event already pending on the eventfd before we + * registered, and trigger it as if we didn't miss it. + */ + events = vfs_poll(fd_file(f), &kirqfd->pt); + if (events & EPOLLIN) + irqfd_inject(kirqfd); + + srcu_read_unlock(&irqfds_srcu, idx); + return 0; + +error_eventfd: + eventfd_ctx_put(kirqfd->eventfd); + +error_kfree: + kfree(kirqfd); + return ret; +} + +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) +{ + struct privcmd_kernel_irqfd *kirqfd; + struct eventfd_ctx *eventfd; + unsigned long flags; + + eventfd = eventfd_ctx_fdget(irqfd->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry(kirqfd, &irqfds_list, list) { + if (kirqfd->eventfd == eventfd) { + irqfd_deactivate(kirqfd); + break; + } + } + + spin_unlock_irqrestore(&irqfds_lock, flags); + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed so + * that we guarantee there will not be any more interrupts once this + * deassign function returns. 
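The ordering that guarantee rests on, condensed from the surrounding code (a restatement, not new API):

/* 1. Under irqfds_lock: irqfd_deactivate() unhooks the entry from
 *    irqfds_list and queues the blocking cleanup on irqfd_cleanup_wq.
 * 2. flush_workqueue() then waits for irqfd_shutdown() to detach the
 *    eventfd wait-queue entry, so no wakeup can inject afterwards. */
spin_lock_irqsave(&irqfds_lock, flags);
irqfd_deactivate(kirqfd);
spin_unlock_irqrestore(&irqfds_lock, flags);
flush_workqueue(irqfd_cleanup_wq);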
+ */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_irqfd irqfd; + + if (copy_from_user(&irqfd, udata, sizeof(irqfd))) + return -EFAULT; + + /* No other flags should be set */ + if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) + return -EINVAL; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) + return -EPERM; + + if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) + return privcmd_irqfd_deassign(&irqfd); + + return privcmd_irqfd_assign(&irqfd); +} + +static int privcmd_irqfd_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void privcmd_irqfd_exit(void) +{ + struct privcmd_kernel_irqfd *kirqfd, *tmp; + unsigned long flags; + + spin_lock_irqsave(&irqfds_lock, flags); + + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) + irqfd_deactivate(kirqfd); + + spin_unlock_irqrestore(&irqfds_lock, flags); + + destroy_workqueue(irqfd_cleanup_wq); +} + +/* Ioeventfd Support */ +#define QUEUE_NOTIFY_VQ_MASK 0xFFFF + +static DEFINE_MUTEX(ioreq_lock); +static LIST_HEAD(ioreq_list); + +/* per-eventfd structure */ +struct privcmd_kernel_ioeventfd { + struct eventfd_ctx *eventfd; + struct list_head list; + u64 addr; + unsigned int addr_len; + unsigned int vq; +}; + +/* per-guest CPU / port structure */ +struct ioreq_port { + int vcpu; + unsigned int port; + struct privcmd_kernel_ioreq *kioreq; +}; + +/* per-guest structure */ +struct privcmd_kernel_ioreq { + domid_t dom; + unsigned int vcpus; + u64 uioreq; + struct ioreq *ioreq; + spinlock_t lock; /* Protects ioeventfds list */ + struct list_head ioeventfds; + struct list_head list; + struct ioreq_port ports[] __counted_by(vcpus); +}; + +static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) +{ + struct ioreq_port *port = dev_id; + struct privcmd_kernel_ioreq *kioreq = port->kioreq; + struct ioreq *ioreq = &kioreq->ioreq[port->vcpu]; + struct privcmd_kernel_ioeventfd *kioeventfd; + unsigned int state = STATE_IOREQ_READY; + + if (ioreq->state != STATE_IOREQ_READY || + ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE) + return IRQ_NONE; + + /* + * We need a barrier, smp_mb(), here to ensure reads are finished before + * `state` is updated. Since the lock implementation ensures that + * appropriate barrier will be added anyway, we can avoid adding + * explicit barrier here. + * + * Ideally we don't need to update `state` within the locks, but we do + * that here to avoid adding explicit barrier. + */ + + spin_lock(&kioreq->lock); + ioreq->state = STATE_IOREQ_INPROCESS; + + list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { + if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && + ioreq->size == kioeventfd->addr_len && + (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { + eventfd_signal(kioeventfd->eventfd); + state = STATE_IORESP_READY; + break; + } + } + spin_unlock(&kioreq->lock); + + /* + * We need a barrier, smp_mb(), here to ensure writes are finished + * before `state` is updated. Since the lock implementation ensures that + * appropriate barrier will be added anyway, we can avoid adding + * explicit barrier here. 
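The match performed in the loop above routes a guest MMIO write to the registered eventfd. Its predicate, pulled out as a standalone sketch (VIRTIO_MMIO_QUEUE_NOTIFY is the usual 0x050 register offset; treat the constant as an assumption here):

#include <stdbool.h>
#include <stdint.h>

#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050
#define QUEUE_NOTIFY_VQ_MASK     0xFFFF

/* A write matches when it hits the QueueNotify register of the
 * registered MMIO window, has the registered width, and its low
 * 16 bits name the registered virtqueue. */
static bool ioeventfd_matches(uint64_t waddr, uint32_t wsize, uint64_t wdata,
                              uint64_t base, uint32_t addr_len, uint32_t vq)
{
        return waddr == base + VIRTIO_MMIO_QUEUE_NOTIFY &&
               wsize == addr_len &&
               (wdata & QUEUE_NOTIFY_VQ_MASK) == vq;
}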
+ */ + + ioreq->state = state; + + if (state == STATE_IORESP_READY) { + notify_remote_via_evtchn(port->port); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + +static void ioreq_free(struct privcmd_kernel_ioreq *kioreq) +{ + struct ioreq_port *ports = kioreq->ports; + int i; + + lockdep_assert_held(&ioreq_lock); + + list_del(&kioreq->list); + + for (i = kioreq->vcpus - 1; i >= 0; i--) + unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]); + + kfree(kioreq); +} + +static +struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioreq *kioreq; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct page **pages; + unsigned int *ports; + int ret, size, i; + + lockdep_assert_held(&ioreq_lock); + + size = struct_size(kioreq, ports, ioeventfd->vcpus); + kioreq = kzalloc(size, GFP_KERNEL); + if (!kioreq) + return ERR_PTR(-ENOMEM); + + kioreq->dom = ioeventfd->dom; + kioreq->vcpus = ioeventfd->vcpus; + kioreq->uioreq = ioeventfd->ioreq; + spin_lock_init(&kioreq->lock); + INIT_LIST_HEAD(&kioreq->ioeventfds); + + /* The memory for ioreq server must have been mapped earlier */ + mmap_write_lock(mm); + vma = find_vma(mm, (unsigned long)ioeventfd->ioreq); + if (!vma) { + pr_err("Failed to find vma for ioreq page!\n"); + mmap_write_unlock(mm); + ret = -EFAULT; + goto error_kfree; + } + + pages = vma->vm_private_data; + kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); + mmap_write_unlock(mm); + + ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports), + kioreq->vcpus, sizeof(*ports)); + if (IS_ERR(ports)) { + ret = PTR_ERR(ports); + goto error_kfree; + } + + for (i = 0; i < kioreq->vcpus; i++) { + kioreq->ports[i].vcpu = i; + kioreq->ports[i].port = ports[i]; + kioreq->ports[i].kioreq = kioreq; + + ret = bind_evtchn_to_irqhandler_lateeoi(ports[i], + ioeventfd_interrupt, IRQF_SHARED, "ioeventfd", + &kioreq->ports[i]); + if (ret < 0) + goto error_unbind; + } + + kfree(ports); + + list_add_tail(&kioreq->list, &ioreq_list); + + return kioreq; + +error_unbind: + while (--i >= 0) + unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); + + kfree(ports); +error_kfree: + kfree(kioreq); + return ERR_PTR(ret); +} + +static struct privcmd_kernel_ioreq * +get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd) +{ + struct privcmd_kernel_ioreq *kioreq; + unsigned long flags; + + list_for_each_entry(kioreq, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd; + + /* + * kioreq fields can be accessed here without a lock as they are + * never updated after being added to the ioreq_list. 
+ */ + if (kioreq->uioreq != ioeventfd->ioreq) { + continue; + } else if (kioreq->dom != ioeventfd->dom || + kioreq->vcpus != ioeventfd->vcpus) { + pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n", + kioreq->dom, ioeventfd->dom, kioreq->vcpus, + ioeventfd->vcpus); + return ERR_PTR(-EINVAL); + } + + /* Look for a duplicate eventfd for the same guest */ + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { + if (eventfd == kioeventfd->eventfd) { + spin_unlock_irqrestore(&kioreq->lock, flags); + return ERR_PTR(-EBUSY); + } + } + spin_unlock_irqrestore(&kioreq->lock, flags); + + return kioreq; + } + + /* Matching kioreq isn't found, allocate a new one */ + return alloc_ioreq(ioeventfd); +} + +static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd) +{ + list_del(&kioeventfd->list); + eventfd_ctx_put(kioeventfd->eventfd); + kfree(kioeventfd); +} + +static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioeventfd *kioeventfd; + struct privcmd_kernel_ioreq *kioreq; + unsigned long flags; + int ret; + + /* Check for range overflow */ + if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr) + return -EINVAL; + + /* Vhost requires us to support length 1, 2, 4, and 8 */ + if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 || + ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8)) + return -EINVAL; + + /* 4096 vcpus limit enough ? */ + if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096) + return -EINVAL; + + kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL); + if (!kioeventfd) + return -ENOMEM; + + kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); + if (IS_ERR(kioeventfd->eventfd)) { + ret = PTR_ERR(kioeventfd->eventfd); + goto error_kfree; + } + + kioeventfd->addr = ioeventfd->addr; + kioeventfd->addr_len = ioeventfd->addr_len; + kioeventfd->vq = ioeventfd->vq; + + mutex_lock(&ioreq_lock); + kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd); + if (IS_ERR(kioreq)) { + mutex_unlock(&ioreq_lock); + ret = PTR_ERR(kioreq); + goto error_eventfd; + } + + spin_lock_irqsave(&kioreq->lock, flags); + list_add_tail(&kioeventfd->list, &kioreq->ioeventfds); + spin_unlock_irqrestore(&kioreq->lock, flags); + + mutex_unlock(&ioreq_lock); + + return 0; + +error_eventfd: + eventfd_ctx_put(kioeventfd->eventfd); + +error_kfree: + kfree(kioeventfd); + return ret; +} + +static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd) +{ + struct privcmd_kernel_ioreq *kioreq, *tkioreq; + struct eventfd_ctx *eventfd; + unsigned long flags; + int ret = 0; + + eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&ioreq_lock); + list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; + /* + * kioreq fields can be accessed here without a lock as they are + * never updated after being added to the ioreq_list. 
+ */ + if (kioreq->dom != ioeventfd->dom || + kioreq->uioreq != ioeventfd->ioreq || + kioreq->vcpus != ioeventfd->vcpus) + continue; + + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) { + if (eventfd == kioeventfd->eventfd) { + ioeventfd_free(kioeventfd); + spin_unlock_irqrestore(&kioreq->lock, flags); + + if (list_empty(&kioreq->ioeventfds)) + ioreq_free(kioreq); + goto unlock; + } + } + spin_unlock_irqrestore(&kioreq->lock, flags); + break; + } + + pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n", + ioeventfd->dom, ioeventfd->addr); + ret = -ENODEV; + +unlock: + mutex_unlock(&ioreq_lock); + eventfd_ctx_put(eventfd); + + return ret; +} + +static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + struct privcmd_ioeventfd ioeventfd; + + if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd))) + return -EFAULT; + + /* No other flags should be set */ + if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) + return -EINVAL; + + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom) + return -EPERM; + + if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) + return privcmd_ioeventfd_deassign(&ioeventfd); + + return privcmd_ioeventfd_assign(&ioeventfd); +} + +static void privcmd_ioeventfd_exit(void) +{ + struct privcmd_kernel_ioreq *kioreq, *tmp; + unsigned long flags; + + mutex_lock(&ioreq_lock); + list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) { + struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; + + spin_lock_irqsave(&kioreq->lock, flags); + list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) + ioeventfd_free(kioeventfd); + spin_unlock_irqrestore(&kioreq->lock, flags); + + ioreq_free(kioreq); + } + mutex_unlock(&ioreq_lock); +} +#else +static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) +{ + return -EOPNOTSUPP; +} + +static inline int privcmd_irqfd_init(void) +{ + return 0; +} + +static inline void privcmd_irqfd_exit(void) +{ +} + +static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) +{ + return -EOPNOTSUPP; +} + +static inline void privcmd_ioeventfd_exit(void) +{ +} +#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */ + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -886,6 +1541,18 @@ static long privcmd_ioctl(struct file *file, ret = privcmd_ioctl_mmap_resource(file, udata); break; + case IOCTL_PRIVCMD_IRQFD: + ret = privcmd_ioctl_irqfd(file, udata); + break; + + case IOCTL_PRIVCMD_IOEVENTFD: + ret = privcmd_ioctl_ioeventfd(file, udata); + break; + + case IOCTL_PRIVCMD_PCIDEV_GET_GSI: + ret = privcmd_ioctl_pcidev_get_gsi(file, udata); + break; + default: break; } @@ -922,16 +1589,16 @@ static void privcmd_close(struct vm_area_struct *vma) int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; int rc; - if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + if (xen_pv_domain() || !numpgs || !pages) return; rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); if (rc == 0) - free_xenballooned_pages(numpgs, pages); + xen_free_unpopulated_pages(numpgs, pages); else pr_crit("unable to unmap MFN range: leaking %d pages. 
rc=%d\n", numpgs, rc); - kfree(pages); + kvfree(pages); } static vm_fault_t privcmd_fault(struct vm_fault *vmf) @@ -952,8 +1619,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; @@ -965,10 +1632,9 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) * on a per pfn/pte basis. Mapping calls that fail with ENOENT * can be then retried until success. */ -static int is_mapped_fn(pte_t *pte, struct page *pmd_page, - unsigned long addr, void *data) +static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { - return pte_none(*pte) ? 0 : -EBUSY; + return pte_none(ptep_get(pte)) ? 0 : -EBUSY; } static int privcmd_vma_range_is_mapped( @@ -1011,15 +1677,28 @@ static int __init privcmd_init(void) err = misc_register(&xen_privcmdbuf_dev); if (err != 0) { pr_err("Could not register Xen hypercall-buf device\n"); - misc_deregister(&privcmd_dev); - return err; + goto err_privcmdbuf; + } + + err = privcmd_irqfd_init(); + if (err != 0) { + pr_err("irqfd init failed\n"); + goto err_irqfd; } return 0; + +err_irqfd: + misc_deregister(&xen_privcmdbuf_dev); +err_privcmdbuf: + misc_deregister(&privcmd_dev); + return err; } static void __exit privcmd_exit(void) { + privcmd_ioeventfd_exit(); + privcmd_irqfd_exit(); misc_deregister(&privcmd_dev); misc_deregister(&xen_privcmdbuf_dev); } diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index 7aa64d1b119c..c5b6f6fa11eb 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -1,15 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * (c) 2017 Stefano Stabellini <stefano@aporeto.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ #include <linux/inet.h> @@ -23,6 +14,7 @@ #include <net/inet_common.h> #include <net/inet_connection_sock.h> #include <net/request_sock.h> +#include <trace/events/sock.h> #include <xen/events.h> #include <xen/grant_table.h> @@ -33,7 +25,7 @@ #define PVCALLS_VERSIONS "1" #define MAX_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER -struct pvcalls_back_global { +static struct pvcalls_back_global { struct list_head frontends; struct semaphore frontends_lock; } pvcalls_back_global; @@ -75,6 +67,7 @@ struct sock_mapping { atomic_t write; atomic_t io; atomic_t release; + atomic_t eoi; void (*saved_data_ready)(struct sock *sk); struct pvcalls_ioworker ioworker; }; @@ -96,7 +89,7 @@ static int pvcalls_back_release_active(struct xenbus_device *dev, struct pvcalls_fedata *fedata, struct sock_mapping *map); -static void pvcalls_conn_back_read(void *opaque) +static bool pvcalls_conn_back_read(void *opaque) { struct sock_mapping *map = (struct sock_mapping *)opaque; struct msghdr msg; @@ -116,17 +109,17 @@ static void pvcalls_conn_back_read(void *opaque) virt_mb(); if (error) - return; + return false; size = pvcalls_queued(prod, cons, array_size); if (size >= array_size) - return; + return false; spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); if (skb_queue_empty(&map->sock->sk->sk_receive_queue)) { atomic_set(&map->read, 0); spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); - return; + return true; } spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); wanted = array_size - size; @@ -137,20 +130,20 @@ static void pvcalls_conn_back_read(void *opaque) if (masked_prod < masked_cons) { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = wanted; - iov_iter_kvec(&msg.msg_iter, WRITE, vec, 1, wanted); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, vec, 1, wanted); } else { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = array_size - masked_prod; vec[1].iov_base = data->in; vec[1].iov_len = wanted - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, WRITE, vec, 2, wanted); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, vec, 2, wanted); } atomic_set(&map->read, 0); ret = inet_recvmsg(map->sock, &msg, wanted, MSG_DONTWAIT); WARN_ON(ret > wanted); if (ret == -EAGAIN) /* shouldn't happen */ - return; + return true; if (!ret) ret = -ENOTCONN; spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); @@ -169,10 +162,10 @@ static void pvcalls_conn_back_read(void *opaque) virt_wmb(); notify_remote_via_irq(map->irq); - return; + return true; } -static void pvcalls_conn_back_write(struct sock_mapping *map) +static bool pvcalls_conn_back_write(struct sock_mapping *map) { struct pvcalls_data_intf *intf = map->ring; struct pvcalls_data *data = &map->data; @@ -181,6 +174,8 @@ static void pvcalls_conn_back_write(struct sock_mapping *map) RING_IDX cons, prod, size, array_size; int ret; + atomic_set(&map->write, 0); + cons = intf->out_cons; prod = intf->out_prod; /* read the indexes before dealing with the data */ @@ -189,30 +184,28 @@ static void pvcalls_conn_back_write(struct sock_mapping *map) array_size = XEN_FLEX_RING_SIZE(map->ring_order); size = pvcalls_queued(prod, cons, array_size); if (size == 0) - return; + return false; memset(&msg, 0, sizeof(msg)); msg.msg_flags |= MSG_DONTWAIT; if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) { vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); vec[0].iov_len = size; - iov_iter_kvec(&msg.msg_iter, READ, vec, 1, size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, size); } else { 
vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); vec[0].iov_len = array_size - pvcalls_mask(cons, array_size); vec[1].iov_base = data->out; vec[1].iov_len = size - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, vec, 2, size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 2, size); } - atomic_set(&map->write, 0); ret = inet_sendmsg(map->sock, &msg, size); - if (ret == -EAGAIN || (ret >= 0 && ret < size)) { + if (ret == -EAGAIN) { atomic_inc(&map->write); atomic_inc(&map->io); + return true; } - if (ret == -EAGAIN) - return; /* write the data, then update the indexes */ virt_wmb(); @@ -225,9 +218,13 @@ static void pvcalls_conn_back_write(struct sock_mapping *map) } /* update the indexes, then notify the other end */ virt_wmb(); - if (prod != cons + ret) + if (prod != cons + ret) { atomic_inc(&map->write); + atomic_inc(&map->io); + } notify_remote_via_irq(map->irq); + + return true; } static void pvcalls_back_ioworker(struct work_struct *work) @@ -236,6 +233,7 @@ static void pvcalls_back_ioworker(struct work_struct *work) struct pvcalls_ioworker, register_work); struct sock_mapping *map = container_of(ioworker, struct sock_mapping, ioworker); + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; while (atomic_read(&map->io) > 0) { if (atomic_read(&map->release) > 0) { @@ -243,10 +241,18 @@ static void pvcalls_back_ioworker(struct work_struct *work) return; } - if (atomic_read(&map->read) > 0) - pvcalls_conn_back_read(map); - if (atomic_read(&map->write) > 0) - pvcalls_conn_back_write(map); + if (atomic_read(&map->read) > 0 && + pvcalls_conn_back_read(map)) + eoi_flags = 0; + if (atomic_read(&map->write) > 0 && + pvcalls_conn_back_write(map)) + eoi_flags = 0; + + if (atomic_read(&map->eoi) > 0 && !atomic_read(&map->write)) { + atomic_set(&map->eoi, 0); + xen_irq_lateeoi(map->irq, eoi_flags); + eoi_flags = XEN_EOI_FLAG_SPURIOUS; + } atomic_dec(&map->io); } @@ -296,6 +302,8 @@ static void pvcalls_sk_data_ready(struct sock *sock) struct sock_mapping *map = sock->sk_user_data; struct pvcalls_ioworker *iow; + trace_sk_data_ready(sock); + if (map == NULL) return; @@ -309,7 +317,7 @@ static struct sock_mapping *pvcalls_new_active_socket( struct pvcalls_fedata *fedata, uint64_t id, grant_ref_t ref, - uint32_t evtchn, + evtchn_port_t evtchn, struct socket *sock) { int ret; @@ -317,8 +325,10 @@ static struct sock_mapping *pvcalls_new_active_socket( void *page; map = kzalloc(sizeof(*map), GFP_KERNEL); - if (map == NULL) + if (map == NULL) { + sock_release(sock); return NULL; + } map->fedata = fedata; map->sock = sock; @@ -343,12 +353,9 @@ static struct sock_mapping *pvcalls_new_active_socket( goto out; map->bytes = page; - ret = bind_interdomain_evtchn_to_irqhandler(fedata->dev->otherend_id, - evtchn, - pvcalls_back_conn_event, - 0, - "pvcalls-backend", - map); + ret = bind_interdomain_evtchn_to_irqhandler_lateeoi( + fedata->dev, evtchn, + pvcalls_back_conn_event, 0, "pvcalls-backend", map); if (ret < 0) goto out; map->irq = ret; @@ -356,7 +363,7 @@ static struct sock_mapping *pvcalls_new_active_socket( map->data.in = map->bytes; map->data.out = map->bytes + XEN_FLEX_RING_SIZE(map->ring_order); - map->ioworker.wq = alloc_workqueue("pvcalls_io", WQ_UNBOUND, 1); + map->ioworker.wq = alloc_ordered_workqueue("pvcalls_io", 0); if (!map->ioworker.wq) goto out; atomic_set(&map->io, 1); @@ -402,7 +409,7 @@ static int pvcalls_back_connect(struct xenbus_device *dev, ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock); if (ret < 0) goto out; - ret = inet_stream_connect(sock, sa, req->u.connect.len, 
0); + ret = inet_stream_connect(sock, (struct sockaddr_unsized *)sa, req->u.connect.len, 0); if (ret < 0) { sock_release(sock); goto out; @@ -413,10 +420,8 @@ static int pvcalls_back_connect(struct xenbus_device *dev, req->u.connect.ref, req->u.connect.evtchn, sock); - if (!map) { + if (!map) ret = -EFAULT; - sock_release(sock); - } out: rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); @@ -464,7 +469,6 @@ static int pvcalls_back_release_passive(struct xenbus_device *dev, write_unlock_bh(&mappass->sock->sk->sk_callback_lock); } sock_release(mappass->sock); - flush_workqueue(mappass->wq); destroy_workqueue(mappass->wq); kfree(mappass); @@ -513,6 +517,10 @@ static void __pvcalls_back_accept(struct work_struct *work) { struct sockpass_mapping *mappass = container_of( work, struct sockpass_mapping, register_work); + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + .kern = true, + }; struct sock_mapping *map; struct pvcalls_ioworker *iow; struct pvcalls_fedata *fedata; @@ -544,7 +552,7 @@ static void __pvcalls_back_accept(struct work_struct *work) sock->type = mappass->sock->type; sock->ops = mappass->sock->ops; - ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true); + ret = inet_accept(mappass->sock, sock, &arg); if (ret == -EAGAIN) { sock_release(sock); return; @@ -557,7 +565,6 @@ static void __pvcalls_back_accept(struct work_struct *work) sock); if (!map) { ret = -EFAULT; - sock_release(sock); goto out_error; } @@ -588,6 +595,8 @@ static void pvcalls_pass_sk_data_ready(struct sock *sock) unsigned long flags; int notify; + trace_sk_data_ready(sock); + if (mappass == NULL) return; @@ -631,7 +640,7 @@ static int pvcalls_back_bind(struct xenbus_device *dev, INIT_WORK(&map->register_work, __pvcalls_back_accept); spin_lock_init(&map->copy_lock); - map->wq = alloc_workqueue("pvcalls_wq", WQ_UNBOUND, 1); + map->wq = alloc_ordered_workqueue("pvcalls_wq", 0); if (!map->wq) { ret = -ENOMEM; goto out; @@ -641,7 +650,7 @@ static int pvcalls_back_bind(struct xenbus_device *dev, if (ret < 0) goto out; - ret = inet_bind(map->sock, (struct sockaddr *)&req->u.bind.addr, + ret = inet_bind(map->sock, (struct sockaddr_unsized *)&req->u.bind.addr, req->u.bind.len); if (ret < 0) goto out; @@ -784,7 +793,7 @@ static int pvcalls_back_poll(struct xenbus_device *dev, mappass->reqcopy = *req; icsk = inet_csk(mappass->sock->sk); queue = &icsk->icsk_accept_queue; - data = queue->rskq_accept_head != NULL; + data = READ_ONCE(queue->rskq_accept_head) != NULL; if (data) { mappass->reqcopy.cmd = 0; ret = 0; @@ -882,15 +891,18 @@ static irqreturn_t pvcalls_back_event(int irq, void *dev_id) { struct xenbus_device *dev = dev_id; struct pvcalls_fedata *fedata = NULL; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; - if (dev == NULL) - return IRQ_HANDLED; + if (dev) { + fedata = dev_get_drvdata(&dev->dev); + if (fedata) { + pvcalls_back_work(fedata); + eoi_flags = 0; + } + } - fedata = dev_get_drvdata(&dev->dev); - if (fedata == NULL) - return IRQ_HANDLED; + xen_irq_lateeoi(irq, eoi_flags); - pvcalls_back_work(fedata); return IRQ_HANDLED; } @@ -900,12 +912,15 @@ static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map) struct pvcalls_ioworker *iow; if (map == NULL || map->sock == NULL || map->sock->sk == NULL || - map->sock->sk->sk_user_data != map) + map->sock->sk->sk_user_data != map) { + xen_irq_lateeoi(irq, 0); return IRQ_HANDLED; + } iow = &map->ioworker; atomic_inc(&map->write); + atomic_inc(&map->eoi); atomic_inc(&map->io); queue_work(iow->wq, &iow->register_work); @@ -914,7 +929,8 
@@ static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map) static int backend_connect(struct xenbus_device *dev) { - int err, evtchn; + int err; + evtchn_port_t evtchn; grant_ref_t ring_ref; struct pvcalls_fedata *fedata = NULL; @@ -940,7 +956,7 @@ static int backend_connect(struct xenbus_device *dev) goto error; } - err = bind_interdomain_evtchn_to_irq(dev->otherend_id, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(dev, evtchn); if (err < 0) goto error; fedata->irq = err; @@ -1096,7 +1112,8 @@ static void set_backend_state(struct xenbus_device *dev, case XenbusStateInitialised: switch (state) { case XenbusStateConnected: - backend_connect(dev); + if (backend_connect(dev)) + return; xenbus_switch_state(dev, XenbusStateConnected); break; case XenbusStateClosing: @@ -1173,12 +1190,11 @@ static void pvcalls_back_changed(struct xenbus_device *dev, } } -static int pvcalls_back_remove(struct xenbus_device *dev) +static void pvcalls_back_remove(struct xenbus_device *dev) { - return 0; } -static int pvcalls_back_uevent(struct xenbus_device *xdev, +static int pvcalls_back_uevent(const struct xenbus_device *xdev, struct kobj_uevent_env *env) { return 0; diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 8a249c95c193..4926d4badc57 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1,15 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * (c) 2017 Stefano Stabellini <stefano@aporeto.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
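Before the frontend changes: every backend event channel above moved to the lateeoi model, where the handler itself signals end-of-interrupt and flags wakeups that produced no work, letting the core throttle a misbehaving peer. The handler-side shape, as a sketch (did_real_work() is a hypothetical stand-in for the checks pvcalls_back_event and the ioworker perform):

static irqreturn_t handler(int irq, void *data)
{
        unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;

        if (did_real_work(data))    /* hypothetical work check */
                eoi_flags = 0;      /* genuine event: don't flag spurious */

        xen_irq_lateeoi(irq, eoi_flags);
        return IRQ_HANDLED;
}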
*/ #include <linux/module.h> @@ -234,22 +225,32 @@ again: return IRQ_HANDLED; } -static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, - struct sock_mapping *map) +static void free_active_ring(struct sock_mapping *map); + +static void pvcalls_front_destroy_active(struct pvcalls_bedata *bedata, + struct sock_mapping *map) { int i; unbind_from_irqhandler(map->active.irq, map); - spin_lock(&bedata->socket_lock); - if (!list_empty(&map->list)) - list_del_init(&map->list); - spin_unlock(&bedata->socket_lock); + if (bedata) { + spin_lock(&bedata->socket_lock); + if (!list_empty(&map->list)) + list_del_init(&map->list); + spin_unlock(&bedata->socket_lock); + } for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) - gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0); - gnttab_end_foreign_access(map->active.ref, 0, 0); - free_page((unsigned long)map->active.ring); + gnttab_end_foreign_access(map->active.ring->ref[i], NULL); + gnttab_end_foreign_access(map->active.ref, NULL); + free_active_ring(map); +} + +static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ + pvcalls_front_destroy_active(bedata, map); kfree(map); } @@ -340,14 +341,15 @@ int pvcalls_front_socket(struct socket *sock) pvcalls_exit(); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_socket); static void free_active_ring(struct sock_mapping *map) { if (!map->active.ring) return; - free_pages((unsigned long)map->active.data.in, - map->active.ring->ring_order); + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); free_page((unsigned long)map->active.ring); } @@ -361,8 +363,8 @@ static int alloc_active_ring(struct sock_mapping *map) goto out; map->active.ring->ring_order = PVCALLS_RING_ORDER; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PVCALLS_RING_ORDER); + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); if (!bytes) goto out; @@ -377,12 +379,12 @@ out: return -ENOMEM; } -static int create_active(struct sock_mapping *map, int *evtchn) +static int create_active(struct sock_mapping *map, evtchn_port_t *evtchn) { void *bytes; - int ret = -ENOMEM, irq = -1, i; + int ret, irq = -1, i; - *evtchn = -1; + *evtchn = 0; init_waitqueue_head(&map->active.inflight_conn_req); bytes = map->active.data.in; @@ -413,7 +415,7 @@ static int create_active(struct sock_mapping *map, int *evtchn) return 0; out_error: - if (*evtchn >= 0) + if (*evtchn > 0) xenbus_free_evtchn(pvcalls_front_dev, *evtchn); return ret; } @@ -424,7 +426,8 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, struct pvcalls_bedata *bedata; struct sock_mapping *map = NULL; struct xen_pvcalls_request *req; - int notify, req_id, ret, evtchn; + int notify, req_id, ret; + evtchn_port_t evtchn; if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) return -EOPNOTSUPP; @@ -439,19 +442,18 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, pvcalls_exit_sock(sock); return ret; } - - spin_lock(&bedata->socket_lock); - ret = get_request(bedata, &req_id); + ret = create_active(map, &evtchn); if (ret < 0) { - spin_unlock(&bedata->socket_lock); free_active_ring(map); pvcalls_exit_sock(sock); return ret; } - ret = create_active(map, &evtchn); + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - free_active_ring(map); + pvcalls_front_destroy_active(NULL, map); pvcalls_exit_sock(sock); return ret; } @@ -485,6 +487,7 @@ int 
pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, pvcalls_exit_sock(sock); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_connect); static int __write_ring(struct pvcalls_data_intf *intf, struct pvcalls_data *data, @@ -540,7 +543,6 @@ out: int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - struct pvcalls_bedata *bedata; struct sock_mapping *map; int sent, tot_sent = 0; int count = 0, flags; @@ -552,7 +554,6 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, map = pvcalls_enter_sock(sock); if (IS_ERR(map)) return PTR_ERR(map); - bedata = dev_get_drvdata(&pvcalls_front_dev->dev); mutex_lock(&map->active.out_mutex); if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { @@ -582,6 +583,7 @@ again: pvcalls_exit_sock(sock); return tot_sent; } +EXPORT_SYMBOL_GPL(pvcalls_front_sendmsg); static int __read_ring(struct pvcalls_data_intf *intf, struct pvcalls_data *data, @@ -635,7 +637,6 @@ out: int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - struct pvcalls_bedata *bedata; int ret; struct sock_mapping *map; @@ -645,7 +646,6 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, map = pvcalls_enter_sock(sock); if (IS_ERR(map)) return PTR_ERR(map); - bedata = dev_get_drvdata(&pvcalls_front_dev->dev); mutex_lock(&map->active.in_mutex); if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) @@ -669,6 +669,7 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, pvcalls_exit_sock(sock); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_recvmsg); int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -722,6 +723,7 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) pvcalls_exit_sock(sock); return 0; } +EXPORT_SYMBOL_GPL(pvcalls_front_bind); int pvcalls_front_listen(struct socket *sock, int backlog) { @@ -771,14 +773,17 @@ int pvcalls_front_listen(struct socket *sock, int backlog) pvcalls_exit_sock(sock); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_listen); -int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct pvcalls_bedata *bedata; struct sock_mapping *map; struct sock_mapping *map2 = NULL; struct xen_pvcalls_request *req; - int notify, req_id, ret, evtchn, nonblock; + int notify, req_id, ret, nonblock; + evtchn_port_t evtchn; map = pvcalls_enter_sock(sock); if (IS_ERR(map)) @@ -790,7 +795,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) return -EINVAL; } - nonblock = flags & SOCK_NONBLOCK; + nonblock = arg->flags & SOCK_NONBLOCK; /* * Backend only supports 1 inflight accept request, will return * errors for the others @@ -830,28 +835,27 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) pvcalls_exit_sock(sock); return ret; } - spin_lock(&bedata->socket_lock); - ret = get_request(bedata, &req_id); + ret = create_active(map2, &evtchn); if (ret < 0) { - clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, - (void *)&map->passive.flags); - spin_unlock(&bedata->socket_lock); free_active_ring(map2); kfree(map2); + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); pvcalls_exit_sock(sock); return ret; } - ret = create_active(map2, &evtchn); + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); if (ret < 0) { - free_active_ring(map2); - kfree(map2); 
clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); + pvcalls_front_free_map(bedata, map2); pvcalls_exit_sock(sock); return ret; } + list_add_tail(&map2->list, &bedata->socket_mappings); req = RING_GET_REQUEST(&bedata->ring, req_id); @@ -907,6 +911,7 @@ received: pvcalls_exit_sock(sock); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_accept); static __poll_t pvcalls_front_poll_passive(struct file *file, struct pvcalls_bedata *bedata, @@ -1007,6 +1012,7 @@ __poll_t pvcalls_front_poll(struct file *file, struct socket *sock, pvcalls_exit_sock(sock); return ret; } +EXPORT_SYMBOL_GPL(pvcalls_front_poll); int pvcalls_front_release(struct socket *sock) { @@ -1090,13 +1096,14 @@ int pvcalls_front_release(struct socket *sock) pvcalls_exit(); return 0; } +EXPORT_SYMBOL_GPL(pvcalls_front_release); static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } }; -static int pvcalls_front_remove(struct xenbus_device *dev) +static void pvcalls_front_remove(struct xenbus_device *dev) { struct pvcalls_bedata *bedata; struct sock_mapping *map = NULL, *n; @@ -1128,17 +1135,17 @@ static int pvcalls_front_remove(struct xenbus_device *dev) } } if (bedata->ref != -1) - gnttab_end_foreign_access(bedata->ref, 0, 0); + gnttab_end_foreign_access(bedata->ref, NULL); kfree(bedata->ring.sring); kfree(bedata); xenbus_switch_state(dev, XenbusStateClosed); - return 0; } static int pvcalls_front_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { - int ret = -ENOMEM, evtchn, i; + int ret = -ENOMEM, i; + evtchn_port_t evtchn; unsigned int max_page_order, function_calls, len; char *versions; grant_ref_t gref_head = 0; @@ -1273,7 +1280,7 @@ static void pvcalls_front_changed(struct xenbus_device *dev, if (dev->state == XenbusStateClosed) break; /* Missed the backend's CLOSING state */ - /* fall through */ + fallthrough; case XenbusStateClosing: xenbus_frontend_closed(dev); break; @@ -1285,6 +1292,7 @@ static struct xenbus_driver pvcalls_front_driver = { .probe = pvcalls_front_probe, .remove = pvcalls_front_remove, .otherend_changed = pvcalls_front_changed, + .not_essential = true, }; static int __init pvcalls_frontend_init(void) diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index f694ad77379f..881ef14660bc 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -12,7 +12,7 @@ int pvcalls_front_bind(struct socket *sock, int pvcalls_front_listen(struct socket *sock, int backlog); int pvcalls_front_accept(struct socket *sock, struct socket *newsock, - int flags); + struct proto_accept_arg *arg); int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 989cf872b98c..ccf25027bec1 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -1,18 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2010 * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> * * This code provides a IOMMU for Xen PV guests with PCI passthrough. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License v2.0 as published by - * the Free Software Foundation - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
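The connect and accept reorderings above follow the same rule: create_active(), which allocates the data ring and an event channel and can fail, now runs before bedata->socket_lock is taken and before a request slot is claimed via get_request(), so the unwind path never executes under the lock and cannot leak a slot. A condensed sketch of the shape, reusing the driver's names with declarations and the request-fill details elided:

	ret = create_active(map, &evtchn);	/* fallible, no lock held */
	if (ret < 0)
		return ret;			/* nothing locked to unwind */

	spin_lock(&bedata->socket_lock);
	ret = get_request(bedata, &req_id);	/* claim a slot under the lock */
	if (ret < 0) {
		spin_unlock(&bedata->socket_lock);
		pvcalls_front_destroy_active(NULL, map);
		return ret;
	}
	/* ... fill in the request and notify the backend ... */
	spin_unlock(&bedata->socket_lock);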
- * * PV guests under Xen are running in an non-contiguous memory architecture. * * When PCI pass-through is utilized, this necessitates an IOMMU for @@ -30,13 +22,13 @@ * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are * allocated in descending order (high to low), meaning the guest might * never get any MFN's under the 4GB mark. - * */ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt #include <linux/memblock.h> #include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> #include <xen/swiotlb-xen.h> #include <xen/page.h> @@ -44,466 +36,315 @@ #include <xen/hvc-console.h> #include <asm/dma-mapping.h> -#include <asm/xen/page-coherent.h> #include <trace/events/swiotlb.h> -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ +#define MAX_DMA_BITS 32 -static char *xen_io_tlb_start, *xen_io_tlb_end; -static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. */ -static u64 start_dma_addr; - -/* - * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t - * can be 32bit when dma_addr_t is 64bit leading to a loss in - * information if the shift is done before casting to 64bit. - */ -static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +static inline phys_addr_t xen_phys_to_bus(struct device *dev, phys_addr_t paddr) { unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr)); - dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT; + phys_addr_t baddr = (phys_addr_t)bfn << XEN_PAGE_SHIFT; - dma |= paddr & ~XEN_PAGE_MASK; + baddr |= paddr & ~XEN_PAGE_MASK; + return baddr; +} - return dma; +static inline dma_addr_t xen_phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_dma(dev, xen_phys_to_bus(dev, paddr)); } -static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(struct device *dev, + phys_addr_t baddr) { unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr)); - dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT; - phys_addr_t paddr = dma; - - paddr |= baddr & ~XEN_PAGE_MASK; + phys_addr_t paddr = (xen_pfn << XEN_PAGE_SHIFT) | + (baddr & ~XEN_PAGE_MASK); return paddr; } -static inline dma_addr_t xen_virt_to_bus(void *address) +static inline phys_addr_t xen_dma_to_phys(struct device *dev, + dma_addr_t dma_addr) { - return xen_phys_to_bus(virt_to_phys(address)); + return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); } -static int check_pages_physically_contiguous(unsigned long xen_pfn, - unsigned int offset, - size_t length) +static inline bool range_requires_alignment(phys_addr_t p, size_t size) { - unsigned long next_bfn; - int i; - int nr_pages; - - next_bfn = pfn_to_bfn(xen_pfn); - nr_pages = (offset + length + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT; + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); + phys_addr_t bus_addr = pfn_to_bfn(XEN_PFN_DOWN(p)) << XEN_PAGE_SHIFT; - for (i = 1; i < nr_pages; i++) { - if (pfn_to_bfn(++xen_pfn) != ++next_bfn) - return 0; - } - return 1; + return IS_ALIGNED(p, algn) && !IS_ALIGNED(bus_addr, algn); } static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { - unsigned long xen_pfn = XEN_PFN_DOWN(p); - unsigned int offset = p & ~XEN_PAGE_MASK; - - if (offset + size <= XEN_PAGE_SIZE) - return 0; - if (check_pages_physically_contiguous(xen_pfn, offset, size)) - return 0; - return 1; + unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); + unsigned int i, 
nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + + next_bfn = pfn_to_bfn(xen_pfn); + + for (i = 1; i < nr_pages; i++) + if (pfn_to_bfn(++xen_pfn) != ++next_bfn) + return 1; + + return 0; } -static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) +static struct io_tlb_pool *xen_swiotlb_find_pool(struct device *dev, + dma_addr_t dma_addr) { - unsigned long bfn = XEN_PFN_DOWN(dma_addr); + unsigned long bfn = XEN_PFN_DOWN(dma_to_phys(dev, dma_addr)); unsigned long xen_pfn = bfn_to_local_pfn(bfn); - phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn); + phys_addr_t paddr = (phys_addr_t)xen_pfn << XEN_PAGE_SHIFT; /* If the address is outside our domain, it CAN * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. */ - if (pfn_valid(PFN_DOWN(paddr))) { - return paddr >= virt_to_phys(xen_io_tlb_start) && - paddr < virt_to_phys(xen_io_tlb_end); - } - return 0; + if (pfn_valid(PFN_DOWN(paddr))) + return swiotlb_find_pool(dev, paddr); + return NULL; } -static int max_dma_bits = 32; - -static int -xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +#ifdef CONFIG_X86 +int __init xen_swiotlb_fixup(void *buf, unsigned long nslabs) { - int i, rc; - int dma_bits; + int rc; + unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); + unsigned int i, dma_bits = order + PAGE_SHIFT; dma_addr_t dma_handle; phys_addr_t p = virt_to_phys(buf); - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + BUILD_BUG_ON(IO_TLB_SEGSIZE & (IO_TLB_SEGSIZE - 1)); + BUG_ON(nslabs % IO_TLB_SEGSIZE); i = 0; do { - int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); - do { rc = xen_create_contiguous_region( - p + (i << IO_TLB_SHIFT), - get_order(slabs << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), order, dma_bits, &dma_handle); - } while (rc && dma_bits++ < max_dma_bits); + } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; - i += slabs; + i += IO_TLB_SEGSIZE; } while (i < nslabs); return 0; } -static unsigned long xen_set_nslabs(unsigned long nr_tbl) -{ - if (!nr_tbl) { - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); - } else - xen_io_tlb_nslabs = nr_tbl; - - return xen_io_tlb_nslabs << IO_TLB_SHIFT; -} - -enum xen_swiotlb_err { - XEN_SWIOTLB_UNKNOWN = 0, - XEN_SWIOTLB_ENOMEM, - XEN_SWIOTLB_EFIXUP -}; - -static const char *xen_swiotlb_error(enum xen_swiotlb_err err) -{ - switch (err) { - case XEN_SWIOTLB_ENOMEM: - return "Cannot allocate Xen-SWIOTLB buffer\n"; - case XEN_SWIOTLB_EFIXUP: - return "Failed to get contiguous memory for DMA from Xen!\n"\ - "You either: don't have the permissions, do not have"\ - " enough free memory under 4GB, or the hypervisor memory"\ - " is too fragmented!"; - default: - break; - } - return ""; -} -int __ref xen_swiotlb_init(int verbose, bool early) -{ - unsigned long bytes, order; - int rc = -ENOMEM; - enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; - unsigned int repeat = 3; - - xen_io_tlb_nslabs = swiotlb_nr_tbl(); -retry: - bytes = xen_set_nslabs(xen_io_tlb_nslabs); - order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); - /* - * Get IO TLB memory from any location. 
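A worked example of the translation helpers above, with invented numbers: if paddr == 0x101234 then XEN_PFN_DOWN(paddr) == 0x101, and supposing the p2m gives pfn_to_bfn(0x101) == 0x5a3:

	baddr = (0x5a3 << XEN_PAGE_SHIFT) | (0x101234 & ~XEN_PAGE_MASK)
	      = 0x5a3000 | 0x234
	      = 0x5a3234

The frame number is remapped while the in-page offset survives. The arithmetic is only valid within a single 4 KiB Xen page, which is why range_straddles_page_boundary() walks the covered pfns and reports whether their machine frames stop being consecutive.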
- */ - if (early) - xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes), - PAGE_SIZE); - else { -#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) -#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order); - if (xen_io_tlb_start) - break; - order--; - } - if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", - (PAGE_SIZE << order) >> 20); - xen_io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; - } - } - if (!xen_io_tlb_start) { - m_ret = XEN_SWIOTLB_ENOMEM; - goto error; - } - xen_io_tlb_end = xen_io_tlb_start + bytes; - /* - * And replace that memory with pages under 4GB. - */ - rc = xen_swiotlb_fixup(xen_io_tlb_start, - bytes, - xen_io_tlb_nslabs); - if (rc) { - if (early) - memblock_free(__pa(xen_io_tlb_start), - PAGE_ALIGN(bytes)); - else { - free_pages((unsigned long)xen_io_tlb_start, order); - xen_io_tlb_start = NULL; - } - m_ret = XEN_SWIOTLB_EFIXUP; - goto error; - } - start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - if (early) { - if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, - verbose)) - panic("Cannot allocate SWIOTLB buffer"); - rc = 0; - } else - rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); - - if (!rc) - swiotlb_set_max_segment(PAGE_SIZE); - - return rc; -error: - if (repeat--) { - xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ - (xen_io_tlb_nslabs >> 1)); - pr_info("Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); - goto retry; - } - pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); - if (early) - panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); - else - free_pages((unsigned long)xen_io_tlb_start, order); - return rc; -} static void * -xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) +xen_swiotlb_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - void *ret; + u64 dma_mask = dev->coherent_dma_mask; int order = get_order(size); - u64 dma_mask = DMA_BIT_MASK(32); phys_addr_t phys; - dma_addr_t dev_addr; - - /* - * Ignore region specifiers - the kernel's ideas of - * pseudo-phys memory layout has nothing to do with the - * machine physical layout. We can't allocate highmem - * because we can't return a pointer to it. - */ - flags &= ~(__GFP_DMA | __GFP_HIGHMEM); - - /* Convert the size to actually allocated. */ - size = 1UL << (order + XEN_PAGE_SHIFT); + void *ret; - /* On ARM this function returns an ioremap'ped virtual address for - * which virt_to_phys doesn't return the corresponding physical - * address. In fact on ARM virt_to_phys only works for kernel direct - * mapped RAM memory. Also see comment below. - */ - ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); + /* Align the allocation to the Xen page size */ + size = ALIGN(size, XEN_PAGE_SIZE); + ret = (void *)__get_free_pages(flags, get_order(size)); if (!ret) return ret; - - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; - - /* At this point dma_handle is the physical address, next we are - * going to set it to the machine address. - * Do not use virt_to_phys(ret) because on ARM it doesn't correspond - * to *dma_handle. 
*/ - phys = *dma_handle; - dev_addr = xen_phys_to_bus(phys); - if (((dev_addr + size - 1 <= dma_mask)) && - !range_straddles_page_boundary(phys, size)) - *dma_handle = dev_addr; - else { - if (xen_create_contiguous_region(phys, order, - fls64(dma_mask), dma_handle) != 0) { - xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); - return NULL; - } + phys = virt_to_phys(ret); + + *dma_handle = xen_phys_to_dma(dev, phys); + if (*dma_handle + size - 1 > dma_mask || + range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size)) { + if (xen_create_contiguous_region(phys, order, fls64(dma_mask), + dma_handle) != 0) + goto out_free_pages; + SetPageXenRemapped(virt_to_page(ret)); } + memset(ret, 0, size); return ret; + +out_free_pages: + free_pages((unsigned long)ret, get_order(size)); + return NULL; } static void -xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr, unsigned long attrs) +xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) { + phys_addr_t phys = virt_to_phys(vaddr); int order = get_order(size); - phys_addr_t phys; - u64 dma_mask = DMA_BIT_MASK(32); - - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; - - /* do not use virt_to_phys because on ARM it doesn't return you the - * physical address */ - phys = xen_bus_to_phys(dev_addr); /* Convert the size to actually allocated. */ - size = 1UL << (order + XEN_PAGE_SHIFT); + size = ALIGN(size, XEN_PAGE_SIZE); - if (((dev_addr + size - 1 <= dma_mask)) || - range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region(phys, order); + if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || + WARN_ON_ONCE(range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size))) + return; - xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); + if (TestClearPageXenRemapped(virt_to_page(vaddr))) + xen_destroy_contiguous_region(phys, order); + free_pages((unsigned long)vaddr, get_order(size)); } +#endif /* CONFIG_X86 */ /* * Map a single buffer of the indicated size for DMA in streaming mode. The * physical address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. + * either xen_swiotlb_unmap_phys or xen_swiotlb_dma_sync_single is performed. */ -static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, +static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = xen_phys_to_bus(phys); + dma_addr_t dev_addr; + phys_addr_t map; BUG_ON(dir == DMA_NONE); + + if (attrs & DMA_ATTR_MMIO) { + if (unlikely(!dma_capable(dev, phys, size, false))) { + dev_err_once( + dev, + "DMA addr %pa+%zu overflow (mask %llx, bus limit %llx).\n", + &phys, size, *dev->dma_mask, + dev->bus_dma_limit); + WARN_ON_ONCE(1); + return DMA_MAPPING_ERROR; + } + return phys; + } + + dev_addr = xen_phys_to_dma(dev, phys); + /* * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. 
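None of this is visible at driver call sites: the generic streaming-DMA API resolves to xen_swiotlb_map_phys() above, which either hands back the machine address or transparently bounces through the swiotlb pool. A hypothetical caller, with dev, buf and len assumed in scope:

	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;
	/* ... program the device with 'handle' and let it DMA ... */
	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);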
*/ - if (dma_capable(dev, dev_addr, size) && + if (dma_capable(dev, dev_addr, size, true) && + !dma_kmalloc_needs_bounce(dev, size, dir) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && - (swiotlb_force != SWIOTLB_FORCE)) { - /* we are not interested in the dma_addr returned by - * xen_dma_map_page, only in the potential cache flushes executed - * by the function. */ - xen_dma_map_page(dev, page, dev_addr, offset, size, dir, attrs); - return dev_addr; - } + !is_swiotlb_force_bounce(dev)) + goto done; /* * Oh well, have to allocate and map a bounce buffer. */ - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + trace_swiotlb_bounced(dev, dev_addr, size); - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir, - attrs); - if (map == DMA_MAPPING_ERROR) + map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs); + if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; - dev_addr = xen_phys_to_bus(map); - xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), - dev_addr, map & ~PAGE_MASK, size, dir, attrs); + phys = map; + dev_addr = xen_phys_to_dma(dev, map); /* * Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + if (unlikely(!dma_capable(dev, dev_addr, size, true))) { + __swiotlb_tbl_unmap_single(dev, map, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC, + swiotlb_find_pool(dev, map)); + return DMA_MAPPING_ERROR; + } - return DMA_MAPPING_ERROR; +done: + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) + arch_sync_dma_for_device(phys, size, dir); + else + xen_dma_sync_for_device(dev, dev_addr, size, dir); + } + return dev_addr; } /* * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous xen_swiotlb_map_page call. All + * match what was provided for in a previous xen_swiotlb_map_phys call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t paddr = xen_bus_to_phys(dev_addr); + phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr); + struct io_tlb_pool *pool; BUG_ON(dir == DMA_NONE); - xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs); + if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) + arch_sync_dma_for_cpu(paddr, size, dir); + else + xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); + } /* NOTE: We use dev_addr here, not paddr! 
*/ - if (is_xen_swiotlb_buffer(dev_addr)) - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); + pool = xen_swiotlb_find_pool(hwdev, dev_addr); + if (pool) + __swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, + attrs, pool); } -static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - xen_unmap_single(hwdev, dev_addr, size, dir, attrs); -} - -/* - * Make physical memory consistent for a single streaming mode DMA translation - * after a transfer. - * - * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer - * using the cpu, yet do not wish to teardown the dma mapping, you must - * call this function before doing so. At the next point you give the dma - * address back to the card, you must first perform a - * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer - */ static void -xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) { - phys_addr_t paddr = xen_bus_to_phys(dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (target == SYNC_FOR_CPU) - xen_dma_sync_single_for_cpu(hwdev, dev_addr, size, dir); - - /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - - if (target == SYNC_FOR_DEVICE) - xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir); -} + phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr); + struct io_tlb_pool *pool; + + if (!dev_is_dma_coherent(dev)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + arch_sync_dma_for_cpu(paddr, size, dir); + else + xen_dma_sync_for_cpu(dev, dma_addr, size, dir); + } -void -xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); + pool = xen_swiotlb_find_pool(dev, dma_addr); + if (pool) + __swiotlb_sync_single_for_cpu(dev, paddr, size, dir, pool); } -void -xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) +static void +xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) { - xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); + phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr); + struct io_tlb_pool *pool; + + pool = xen_swiotlb_find_pool(dev, dma_addr); + if (pool) + __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); + + if (!dev_is_dma_coherent(dev)) { + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + arch_sync_dma_for_device(paddr, size, dir); + else + xen_dma_sync_for_device(dev, dma_addr, size, dir); + } } /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_page() above. + * concerning calls here are the same as for swiotlb_unmap_phys() above. 
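The two sync hooks above deliberately mirror each other: the for_cpu variant performs its cache maintenance before the bounce-buffer copy-back, while the for_device variant copies into the bounce buffer first and does the cache maintenance last, so the buffer is coherent at the moment ownership changes hands. Drivers see the usual pairing; a sketch of partial CPU access to a long-lived mapping, where consume() is a hypothetical reader:

	dma_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE);
	consume(buf);	/* CPU may now read the freshly DMA'd data */
	dma_sync_single_for_device(dev, handle, len, DMA_FROM_DEVICE);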
*/ static void -xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) +xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; @@ -511,30 +352,14 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); + xen_swiotlb_unmap_phys(hwdev, sg->dma_address, sg_dma_len(sg), + dir, attrs); } -/* - * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above xen_swiotlb_map_page - * interface. Here the scatter gather list elements are each tagged with the - * appropriate dma address and length. They are obtained via - * sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for xen_swiotlb_map_page are the - * same here. - */ static int -xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) +xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; @@ -542,85 +367,44 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = xen_phys_to_bus(paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - xen_arch_need_swiotlb(hwdev, paddr, dev_addr) || - !dma_capable(hwdev, dev_addr, sg->length) || - range_straddles_page_boundary(paddr, sg->length)) { - phys_addr_t map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, - dir, attrs); - if (map == DMA_MAPPING_ERROR) { - dev_warn(hwdev, "swiotlb buffer is full\n"); - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - dev_addr = xen_phys_to_bus(map); - xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), - dev_addr, - map & ~PAGE_MASK, - sg->length, - dir, - attrs); - sg->dma_address = dev_addr; - } else { - /* we are not interested in the dma_addr returned by - * xen_dma_map_page, only in the potential cache flushes executed - * by the function. */ - xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), - dev_addr, - paddr & ~PAGE_MASK, - sg->length, - dir, - attrs); - sg->dma_address = dev_addr; - } + sg->dma_address = xen_swiotlb_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) + goto out_unmap; sg_dma_len(sg) = sg->length; } + return nelems; +out_unmap: + xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sgl) = 0; + return -EIO; } -/* - * Make physical memory consistent for a set of streaming mode DMA translations - * after a transfer. - * - * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules - * and usage. 
- */ static void -xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - enum dma_sync_target target) +xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir) { struct scatterlist *sg; int i; - for_each_sg(sgl, sg, nelems, i) - xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg_dma_len(sg), dir, target); -} - -static void -xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); + for_each_sg(sgl, sg, nelems, i) { + xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address, + sg->length, dir); + } } static void -xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, +xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) { + xen_swiotlb_sync_single_for_device(dev, sg->dma_address, + sg->length, dir); + } } /* @@ -632,66 +416,29 @@ xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; -} - -/* - * Create userspace mapping for the DMA-coherent memory. - * This function should be called with the pages from the current domain only, - * passing pages mapped from other domains would lead to memory corruption. - */ -static int -xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ -#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (xen_get_dma_ops(dev)->mmap) - return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr, - dma_addr, size, attrs); -#endif - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); -} - -/* - * This function should be called with the pages from the current domain only, - * passing pages mapped from other domains would lead to memory corruption. - */ -static int -xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) -{ -#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (xen_get_dma_ops(dev)->get_sgtable) { -#if 0 - /* - * This check verifies that the page belongs to the current domain and - * is not one mapped from another domain. 
- * This check is for debug only, and should not go to production build - */ - unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle)); - BUG_ON (!page_is_ram(bfn)); -#endif - return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, - handle, size, attrs); - } -#endif - return dma_common_get_sgtable(dev, sgt, cpu_addr, handle, size, attrs); + return xen_phys_to_dma(hwdev, default_swiotlb_limit()) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { +#ifdef CONFIG_X86 .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, +#else + .alloc = dma_direct_alloc, + .free = dma_direct_free, +#endif .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, .sync_single_for_device = xen_swiotlb_sync_single_for_device, .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, - .map_sg = xen_swiotlb_map_sg_attrs, - .unmap_sg = xen_swiotlb_unmap_sg_attrs, - .map_page = xen_swiotlb_map_page, - .unmap_page = xen_swiotlb_unmap_page, + .map_sg = xen_swiotlb_map_sg, + .unmap_sg = xen_swiotlb_unmap_sg, + .map_phys = xen_swiotlb_map_phys, + .unmap_phys = xen_swiotlb_unmap_phys, .dma_supported = xen_swiotlb_dma_supported, - .mmap = xen_swiotlb_dma_mmap, - .get_sgtable = xen_swiotlb_get_sgtable, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .alloc_pages_op = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, + .max_mapping_size = swiotlb_max_mapping_size, }; diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 9d314bba7c4e..2f880374b463 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * copyright (c) 2006 IBM Corporation * Authored by: Mike D. Day <ncmike@us.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/slab.h> @@ -25,17 +22,19 @@ #endif #define HYPERVISOR_ATTR_RO(_name) \ -static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) #define HYPERVISOR_ATTR_RW(_name) \ -static struct hyp_sysfs_attr _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) +static struct hyp_sysfs_attr _name##_attr = __ATTR_RW(_name) struct hyp_sysfs_attr { struct attribute attr; ssize_t (*show)(struct hyp_sysfs_attr *, char *); ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); - void *hyp_attr_data; + union { + void *hyp_attr_data; + unsigned long hyp_attr_value; + }; }; static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) @@ -403,6 +402,60 @@ static int __init xen_sysfs_properties_init(void) return sysfs_create_group(hypervisor_kobj, &xen_properties_group); } +#define FLAG_UNAME "unknown" +#define FLAG_UNAME_FMT FLAG_UNAME "%02u" +#define FLAG_UNAME_MAX sizeof(FLAG_UNAME "XX") +#define FLAG_COUNT (sizeof(xen_start_flags) * BITS_PER_BYTE) +static_assert(sizeof(xen_start_flags) <= + sizeof_field(struct hyp_sysfs_attr, hyp_attr_value)); + +static ssize_t flag_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *p = buffer; + + *p++ = '0' + ((xen_start_flags & attr->hyp_attr_value) != 0); + *p++ = '\n'; + return p - buffer; +} + +#define FLAG_NODE(flag, node) \ + [ilog2(flag)] = { \ + .attr = { .name = #node, .mode = 0444 },\ + .show = flag_show, \ + .hyp_attr_value = flag \ + } + +/* + * Add new, known flags here. 
No other changes are required, but + * note that each known flag wastes one entry in flag_unames[]. + * The code/complexity machinations to avoid this aren't worth it + * for a few entries, but keep it in mind. + */ +static struct hyp_sysfs_attr flag_attrs[FLAG_COUNT] = { + FLAG_NODE(SIF_PRIVILEGED, privileged), + FLAG_NODE(SIF_INITDOMAIN, initdomain) +}; +static struct attribute_group xen_flags_group = { + .name = "start_flags", + .attrs = (struct attribute *[FLAG_COUNT + 1]){} +}; +static char flag_unames[FLAG_COUNT][FLAG_UNAME_MAX]; + +static int __init xen_sysfs_flags_init(void) +{ + for (unsigned fnum = 0; fnum != FLAG_COUNT; fnum++) { + if (likely(flag_attrs[fnum].attr.name == NULL)) { + sprintf(flag_unames[fnum], FLAG_UNAME_FMT, fnum); + flag_attrs[fnum].attr.name = flag_unames[fnum]; + flag_attrs[fnum].attr.mode = 0444; + flag_attrs[fnum].show = flag_show; + flag_attrs[fnum].hyp_attr_value = 1 << fnum; + } + xen_flags_group.attrs[fnum] = &flag_attrs[fnum].attr; + } + return sysfs_create_group(hypervisor_kobj, &xen_flags_group); +} + #ifdef CONFIG_XEN_HAVE_VPMU struct pmu_mode { const char *name; @@ -543,18 +596,22 @@ static int __init hyper_sysfs_init(void) ret = xen_sysfs_properties_init(); if (ret) goto prop_out; + ret = xen_sysfs_flags_init(); + if (ret) + goto flags_out; #ifdef CONFIG_XEN_HAVE_VPMU if (xen_initial_domain()) { ret = xen_sysfs_pmu_init(); if (ret) { - sysfs_remove_group(hypervisor_kobj, - &xen_properties_group); - goto prop_out; + sysfs_remove_group(hypervisor_kobj, &xen_flags_group); + goto flags_out; } } #endif goto out; +flags_out: + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); prop_out: sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); uuid_out: @@ -598,7 +655,7 @@ static const struct sysfs_ops hyp_sysfs_ops = { .store = hyp_sysfs_store, }; -static struct kobj_type hyp_sysfs_kobj_type = { +static const struct kobj_type hyp_sysfs_kobj_type = { .sysfs_ops = &hyp_sysfs_ops, }; diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 0968859c29d0..5683383d2305 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -7,6 +7,7 @@ #include <linux/math64.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/static_call.h> #include <asm/paravirt.h> #include <asm/xen/hypervisor.h> @@ -64,7 +65,7 @@ static void xen_get_runstate_snapshot_cpu_delta( do { state_time = get64(&state->state_entry_time); rmb(); /* Hypervisor might update data. */ - *res = READ_ONCE(*state); + *res = __READ_ONCE(*state); rmb(); /* Hypervisor might update data.
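The loop this hunk lands in is an open-coded sequence-count reader against a writer, the hypervisor, that cannot take guest locks; READ_ONCE() is restricted to scalar types, hence the switch to __READ_ONCE() for the whole-struct copy, which may tear but is retried. For reference, the same loop with the ordering spelled out in comments (a sketch reusing the function's own names):

	do {
		state_time = get64(&state->state_entry_time);
		rmb();		/* read the generation before the payload */
		*res = __READ_ONCE(*state);	/* tearing tolerated */
		rmb();		/* read the payload before re-checking */
	} while (get64(&state->state_entry_time) != state_time ||
		 (state_time & XEN_RUNSTATE_UPDATE));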
*/ } while (get64(&state->state_entry_time) != state_time || (state_time & XEN_RUNSTATE_UPDATE)); @@ -135,14 +136,6 @@ void xen_manage_runstate_time(int action) } } -/* - * Runstate accounting - */ -void xen_get_runstate_snapshot(struct vcpu_runstate_info *res) -{ - xen_get_runstate_snapshot_cpu(res, smp_processor_id()); -} - /* return true when a vcpu could run but has no real cpu to run on */ bool xen_vcpu_stolen(int vcpu) { @@ -175,7 +168,7 @@ void __init xen_time_setup_guest(void) xen_runstate_remote = !HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_runstate_update_flag); - pv_ops.time.steal_clock = xen_steal_clock; + static_call_update(pv_steal_clock, xen_steal_clock); static_key_slow_inc(¶virt_steal_enabled); if (xen_runstate_remote) diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c deleted file mode 100644 index 04e7b3b29bac..000000000000 --- a/drivers/xen/tmem.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Xen implementation for transcendent memory (tmem) - * - * Copyright (C) 2009-2011 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/cleancache.h> -#include <linux/frontswap.h> - -#include <xen/xen.h> -#include <xen/interface/xen.h> -#include <xen/page.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> -#include <xen/tmem.h> - -#ifndef CONFIG_XEN_TMEM_MODULE -bool __read_mostly tmem_enabled = false; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = true; - return 1; -} -__setup("tmem", enable_tmem); -#endif - -#ifdef CONFIG_CLEANCACHE -static bool cleancache __read_mostly = true; -module_param(cleancache, bool, S_IRUGO); -static bool selfballooning __read_mostly = true; -module_param(selfballooning, bool, S_IRUGO); -#endif /* CONFIG_CLEANCACHE */ - -#ifdef CONFIG_FRONTSWAP -static bool frontswap __read_mostly = true; -module_param(frontswap, bool, S_IRUGO); -#else /* CONFIG_FRONTSWAP */ -#define frontswap (0) -#endif /* CONFIG_FRONTSWAP */ - -#ifdef CONFIG_XEN_SELFBALLOONING -static bool selfshrinking __read_mostly = true; -module_param(selfshrinking, bool, S_IRUGO); -#endif /* CONFIG_XEN_SELFBALLOONING */ - -#define TMEM_CONTROL 0 -#define TMEM_NEW_POOL 1 -#define TMEM_DESTROY_POOL 2 -#define TMEM_NEW_PAGE 3 -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_READ 8 -#define TMEM_WRITE 9 -#define TMEM_XCHG 10 - -/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_VERSION_SHIFT 24 - - -struct tmem_pool_uuid { - u64 uuid_lo; - u64 uuid_hi; -}; - -struct tmem_oid { - u64 oid[3]; -}; - -#define TMEM_POOL_PRIVATE_UUID { 0, 0 } - -/* flags for tmem_ops.new_pool */ -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 - -/* xen tmem foundation ops/hypercalls */ - -static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, - u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) -{ - struct tmem_op op; - int rc = 0; - - op.cmd = tmem_cmd; - op.pool_id = tmem_pool; - op.u.gen.oid[0] = oid.oid[0]; - op.u.gen.oid[1] = oid.oid[1]; - op.u.gen.oid[2] = oid.oid[2]; - op.u.gen.index = index; - op.u.gen.tmem_offset = tmem_offset; - op.u.gen.pfn_offset = pfn_offset; - op.u.gen.len = len; - set_xen_guest_handle(op.u.gen.gmfn, (void 
*)gmfn); - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, - u32 flags, unsigned long pagesize) -{ - struct tmem_op op; - int rc = 0, pageshift; - - for (pageshift = 0; pagesize != 1; pageshift++) - pagesize >>= 1; - flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; - flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; - op.cmd = TMEM_NEW_POOL; - op.u.new.uuid[0] = uuid.uuid_lo; - op.u.new.uuid[1] = uuid.uuid_hi; - op.u.new.flags = flags; - rc = HYPERVISOR_tmem_op(&op); - return rc; -} - -/* xen generic tmem ops */ - -static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, - u32 index, struct page *page) -{ - return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, - xen_page_to_gfn(page), 0, 0, 0); -} - -static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, - u32 index, struct page *page) -{ - return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, - xen_page_to_gfn(page), 0, 0, 0); -} - -static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) -{ - return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, - 0, 0, 0, 0); -} - -static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) -{ - return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); -} - - -#ifdef CONFIG_CLEANCACHE -static int xen_tmem_destroy_pool(u32 pool_id) -{ - struct tmem_oid oid = { { 0 } }; - - return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); -} - -/* cleancache ops */ - -static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - if (ind != index) - return; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - (void)xen_tmem_put_page((u32)pool, oid, ind, page); -} - -static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret; - - /* translate return values to linux semantics */ - if (pool < 0) - return -1; - if (ind != index) - return -1; - ret = xen_tmem_get_page((u32)pool, oid, ind, page); - if (ret == 1) - return 0; - else - return -1; -} - -static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - if (ind != index) - return; - (void)xen_tmem_flush_page((u32)pool, oid, ind); -} - -static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (pool < 0) - return; - (void)xen_tmem_flush_object((u32)pool, oid); -} - -static void tmem_cleancache_flush_fs(int pool) -{ - if (pool < 0) - return; - (void)xen_tmem_destroy_pool((u32)pool); -} - -static int tmem_cleancache_init_fs(size_t pagesize) -{ - struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; - - return xen_tmem_new_pool(uuid_private, 0, pagesize); -} - -static int tmem_cleancache_init_shared_fs(uuid_t *uuid, size_t pagesize) -{ - struct tmem_pool_uuid shared_uuid; - - shared_uuid.uuid_lo = *(u64 *)&uuid->b[0]; - shared_uuid.uuid_hi = *(u64 *)&uuid->b[8]; - return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); -} - -static const struct cleancache_ops tmem_cleancache_ops = { - .put_page = tmem_cleancache_put_page, - .get_page = tmem_cleancache_get_page, - .invalidate_page = 
tmem_cleancache_flush_page, - .invalidate_inode = tmem_cleancache_flush_inode, - .invalidate_fs = tmem_cleancache_flush_fs, - .init_shared_fs = tmem_cleancache_init_shared_fs, - .init_fs = tmem_cleancache_init_fs -}; -#endif - -#ifdef CONFIG_FRONTSWAP -/* frontswap tmem operations */ - -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int tmem_frontswap_poolid; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 4 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -/* returns 0 if the page was successfully put into frontswap, -1 if not */ -static int tmem_frontswap_store(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - int pool = tmem_frontswap_poolid; - int ret; - - /* THP isn't supported */ - if (PageTransHuge(page)) - return -1; - - if (pool < 0) - return -1; - if (ind64 != ind) - return -1; - mb(); /* ensure page is quiescent; tmem may address it with an alias */ - ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), page); - /* translate Xen tmem return values to linux semantics */ - if (ret == 1) - return 0; - else - return -1; -} - -/* - * returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) - */ -static int tmem_frontswap_load(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - int pool = tmem_frontswap_poolid; - int ret; - - if (pool < 0) - return -1; - if (ind64 != ind) - return -1; - ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), page); - /* translate Xen tmem return values to linux semantics */ - if (ret == 1) - return 0; - else - return -1; -} - -/* flush a single page from frontswap */ -static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - int pool = tmem_frontswap_poolid; - - if (pool < 0) - return; - if (ind64 != ind) - return; - (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void tmem_frontswap_flush_area(unsigned type) -{ - int pool = tmem_frontswap_poolid; - int ind; - - if (pool < 0) - return; - for (ind = SWIZ_MASK; ind >= 0; ind--) - (void)xen_tmem_flush_object(pool, oswiz(type, ind)); -} - -static void tmem_frontswap_init(unsigned ignored) -{ - struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; - - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (tmem_frontswap_poolid < 0) - tmem_frontswap_poolid = - xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); -} - -static struct frontswap_ops tmem_frontswap_ops = { - .store = tmem_frontswap_store, - .load = tmem_frontswap_load, - .invalidate_page = tmem_frontswap_flush_page, - .invalidate_area = tmem_frontswap_flush_area, - .init = tmem_frontswap_init -}; -#endif - -static int __init xen_tmem_init(void) -{ - if (!xen_domain()) - return 0; -#ifdef CONFIG_FRONTSWAP - if (tmem_enabled && frontswap) { - char *s = ""; - - tmem_frontswap_poolid = -1; - frontswap_register_ops(&tmem_frontswap_ops); - pr_info("frontswap enabled, RAM provided by 
Xen Transcendent Memory%s\n", - s); - } -#endif -#ifdef CONFIG_CLEANCACHE - BUILD_BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && cleancache) { - int err; - - err = cleancache_register_ops(&tmem_cleancache_ops); - if (err) - pr_warn("xen-tmem: failed to enable cleancache: %d\n", - err); - else - pr_info("cleancache enabled, RAM provided by " - "Xen Transcendent Memory\n"); - } -#endif -#ifdef CONFIG_XEN_SELFBALLOONING - /* - * There is no point of driving pages to the swap system if they - * aren't going anywhere in tmem universe. - */ - if (!frontswap) { - selfshrinking = false; - selfballooning = false; - } - xen_selfballoon_init(selfballooning, selfshrinking); -#endif - return 0; -} - -module_init(xen_tmem_init) -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); -MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c new file mode 100644 index 000000000000..d6fc2aefe264 --- /dev/null +++ b/drivers/xen/unpopulated-alloc.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/memremap.h> +#include <linux/slab.h> + +#include <asm/page.h> + +#include <xen/balloon.h> +#include <xen/page.h> +#include <xen/xen.h> + +static DEFINE_MUTEX(list_lock); +static struct page *page_list; +static unsigned int list_count; + +static struct resource *target_resource; + +/* + * If the arch is not happy with the system "iomem_resource" being used for + * the region allocation, it can provide its own view by creating a specific + * Xen resource with unused regions of guest physical address space provided + * by the hypervisor. + */ +int __weak __init arch_xen_unpopulated_init(struct resource **res) +{ + *res = &iomem_resource; + + return 0; +} + +static int fill_list(unsigned int nr_pages) +{ + struct dev_pagemap *pgmap; + struct resource *res, *tmp_res = NULL; + void *vaddr; + unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION); + struct range mhp_range; + int ret; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + res->name = "Xen scratch"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + mhp_range = mhp_get_pluggable_range(true); + + ret = allocate_resource(target_resource, res, + alloc_pages * PAGE_SIZE, mhp_range.start, mhp_range.end, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new IOMEM resource\n"); + goto err_resource; + } + + /* + * Reserve the region previously allocated from the Xen resource to keep + * it from being re-used by someone else.
+ */ + if (target_resource != &iomem_resource) { + tmp_res = kzalloc(sizeof(*tmp_res), GFP_KERNEL); + if (!tmp_res) { + ret = -ENOMEM; + goto err_insert; + } + + tmp_res->name = res->name; + tmp_res->start = res->start; + tmp_res->end = res->end; + tmp_res->flags = res->flags; + + ret = request_resource(&iomem_resource, tmp_res); + if (ret < 0) { + pr_err("Cannot request resource %pR (%d)\n", tmp_res, ret); + kfree(tmp_res); + goto err_insert; + } + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) { + ret = -ENOMEM; + goto err_pgmap; + } + + pgmap->type = MEMORY_DEVICE_GENERIC; + pgmap->range = (struct range) { + .start = res->start, + .end = res->end, + }; + pgmap->nr_range = 1; + pgmap->owner = res; + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * memremap will build page tables for the new memory, so + * the p2m must contain invalid entries to ensure the correct + * non-present PTEs will be written. + * + * If a failure occurs, the original (identity) p2m entries + * are not restored since this region is now known not to + * conflict with any devices. + */ + if (xen_pv_domain()) { + xen_pfn_t pfn = PFN_DOWN(res->start); + + for (i = 0; i < alloc_pages; i++) { + if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { + pr_warn("set_phys_to_machine() failed, no memory added\n"); + ret = -ENOMEM; + goto err_memremap; + } + } + } +#endif + + vaddr = memremap_pages(pgmap, NUMA_NO_NODE); + if (IS_ERR(vaddr)) { + pr_err("Cannot remap memory range\n"); + ret = PTR_ERR(vaddr); + goto err_memremap; + } + + for (i = 0; i < alloc_pages; i++) { + struct page *pg = virt_to_page(vaddr + PAGE_SIZE * i); + + pg->zone_device_data = page_list; + page_list = pg; + list_count++; + } + + return 0; + +err_memremap: + kfree(pgmap); +err_pgmap: + if (tmp_res) { + release_resource(tmp_res); + kfree(tmp_res); + } +err_insert: + release_resource(res); +err_resource: + kfree(res); + return ret; +} + +/** + * xen_alloc_unpopulated_pages - alloc unpopulated pages + * @nr_pages: Number of pages + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + int ret = 0; + + /* + * Fall back to the default behavior if we do not have any suitable + * resource to allocate the required region from; without one we are + * not able to construct pages.
+ */ + if (!target_resource) + return xen_alloc_ballooned_pages(nr_pages, pages); + + mutex_lock(&list_lock); + if (list_count < nr_pages) { + ret = fill_list(nr_pages - list_count); + if (ret) + goto out; + } + + for (i = 0; i < nr_pages; i++) { + struct page *pg = page_list; + + BUG_ON(!pg); + page_list = pg->zone_device_data; + list_count--; + pages[i] = pg; + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (xen_pv_domain()) { + ret = xen_alloc_p2m_entry(page_to_pfn(pg)); + if (ret < 0) { + unsigned int j; + + for (j = 0; j <= i; j++) { + pages[j]->zone_device_data = page_list; + page_list = pages[j]; + list_count++; + } + goto out; + } + } +#endif + } + +out: + mutex_unlock(&list_lock); + return ret; +} +EXPORT_SYMBOL(xen_alloc_unpopulated_pages); + +/** + * xen_free_unpopulated_pages - return unpopulated pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + + if (!target_resource) { + xen_free_ballooned_pages(nr_pages, pages); + return; + } + + mutex_lock(&list_lock); + for (i = 0; i < nr_pages; i++) { + pages[i]->zone_device_data = page_list; + page_list = pages[i]; + list_count++; + } + mutex_unlock(&list_lock); +} +EXPORT_SYMBOL(xen_free_unpopulated_pages); + +static int __init unpopulated_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = arch_xen_unpopulated_init(&target_resource); + if (ret) { + pr_err("xen:unpopulated: Cannot initialize target resource\n"); + target_resource = NULL; + } + + return ret; +} +early_initcall(unpopulated_init); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c deleted file mode 100644 index fdc9e67b842d..000000000000 --- a/drivers/xen/xen-acpi-cpuhotplug.c +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. 
- */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/cpu.h> -#include <linux/acpi.h> -#include <linux/uaccess.h> -#include <acpi/processor.h> -#include <xen/acpi.h> -#include <xen/interface/platform.h> -#include <asm/xen/hypercall.h> - -#define PREFIX "ACPI:xen_cpu_hotplug:" - -#define INSTALL_NOTIFY_HANDLER 0 -#define UNINSTALL_NOTIFY_HANDLER 1 - -static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); - -/* -------------------------------------------------------------------------- - Driver Interface --------------------------------------------------------------------------- */ - -static int xen_acpi_processor_enable(struct acpi_device *device) -{ - acpi_status status = 0; - unsigned long long value; - union acpi_object object = { 0 }; - struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; - struct acpi_processor *pr = acpi_driver_data(device); - - if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { - /* Declared with "Processor" statement; match ProcessorID */ - status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); - if (ACPI_FAILURE(status)) { - pr_err(PREFIX "Evaluating processor object\n"); - return -ENODEV; - } - - pr->acpi_id = object.processor.proc_id; - } else { - /* Declared with "Device" statement; match _UID */ - status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, - NULL, &value); - if (ACPI_FAILURE(status)) { - pr_err(PREFIX "Evaluating processor _UID\n"); - return -ENODEV; - } - - pr->acpi_id = value; - } - - pr->id = xen_pcpu_id(pr->acpi_id); - - if (invalid_logical_cpuid(pr->id)) - /* This cpu is not presented at hypervisor, try to hotadd it */ - if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { - pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", - pr->acpi_id); - return -ENODEV; - } - - return 0; -} - -static int xen_acpi_processor_add(struct acpi_device *device) -{ - int ret; - struct acpi_processor *pr; - - if (!device) - return -EINVAL; - - pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (!pr) - return -ENOMEM; - - pr->handle = device->handle; - strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); - strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); - device->driver_data = pr; - - ret = xen_acpi_processor_enable(device); - if (ret) - pr_err(PREFIX "Error when enabling Xen processor\n"); - - return ret; -} - -static int xen_acpi_processor_remove(struct acpi_device *device) -{ - struct acpi_processor *pr; - - if (!device) - return -EINVAL; - - pr = acpi_driver_data(device); - if (!pr) - return -EINVAL; - - kfree(pr); - return 0; -} - -/*-------------------------------------------------------------- - Acpi processor hotplug support ---------------------------------------------------------------*/ - -static int is_processor_present(acpi_handle handle) -{ - acpi_status status; - unsigned long long sta = 0; - - - status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); - - if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) - return 1; - - /* - * _STA is mandatory for a processor that supports hot plug - */ - if (status == AE_NOT_FOUND) - pr_info(PREFIX "Processor does not support hot plug\n"); - else - pr_info(PREFIX "Processor Device is not present"); - return 0; -} - -static int xen_apic_id(acpi_handle handle) -{ - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *obj; - struct acpi_madt_local_apic *lapic; - int apic_id; - - 
if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) - return -EINVAL; - - if (!buffer.length || !buffer.pointer) - return -EINVAL; - - obj = buffer.pointer; - if (obj->type != ACPI_TYPE_BUFFER || - obj->buffer.length < sizeof(*lapic)) { - kfree(buffer.pointer); - return -EINVAL; - } - - lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; - - if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || - !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { - kfree(buffer.pointer); - return -EINVAL; - } - - apic_id = (uint32_t)lapic->id; - kfree(buffer.pointer); - buffer.length = ACPI_ALLOCATE_BUFFER; - buffer.pointer = NULL; - - return apic_id; -} - -static int xen_hotadd_cpu(struct acpi_processor *pr) -{ - int cpu_id, apic_id, pxm; - struct xen_platform_op op; - - apic_id = xen_apic_id(pr->handle); - if (apic_id < 0) { - pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", - pr->acpi_id); - return -ENODEV; - } - - pxm = xen_acpi_get_pxm(pr->handle); - if (pxm < 0) { - pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", - pr->acpi_id); - return pxm; - } - - op.cmd = XENPF_cpu_hotadd; - op.u.cpu_add.apic_id = apic_id; - op.u.cpu_add.acpi_id = pr->acpi_id; - op.u.cpu_add.pxm = pxm; - - cpu_id = HYPERVISOR_platform_op(&op); - if (cpu_id < 0) - pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", - pr->acpi_id); - - return cpu_id; -} - -static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) -{ - if (!is_processor_present(pr->handle)) - return AE_ERROR; - - pr->id = xen_hotadd_cpu(pr); - if (invalid_logical_cpuid(pr->id)) - return AE_ERROR; - - /* - * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX - * interface after cpu hotadded. - */ - xen_pcpu_hotplug_sync(); - - return AE_OK; -} - -static int acpi_processor_device_remove(struct acpi_device *device) -{ - pr_debug(PREFIX "Xen does not support CPU hotremove\n"); - - return -ENOSYS; -} - -static void acpi_processor_hotplug_notify(acpi_handle handle, - u32 event, void *data) -{ - struct acpi_processor *pr; - struct acpi_device *device = NULL; - u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ - int result; - - acpi_scan_lock_acquire(); - - switch (event) { - case ACPI_NOTIFY_BUS_CHECK: - case ACPI_NOTIFY_DEVICE_CHECK: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Processor driver received %s event\n", - (event == ACPI_NOTIFY_BUS_CHECK) ? - "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); - - if (!is_processor_present(handle)) - break; - - acpi_bus_get_device(handle, &device); - if (acpi_device_enumerated(device)) - break; - - result = acpi_bus_scan(handle); - if (result) { - pr_err(PREFIX "Unable to add the device\n"); - break; - } - device = NULL; - acpi_bus_get_device(handle, &device); - if (!acpi_device_enumerated(device)) { - pr_err(PREFIX "Missing device object\n"); - break; - } - ost_code = ACPI_OST_SC_SUCCESS; - break; - - case ACPI_NOTIFY_EJECT_REQUEST: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "received ACPI_NOTIFY_EJECT_REQUEST\n")); - - if (acpi_bus_get_device(handle, &device)) { - pr_err(PREFIX "Device don't exist, dropping EJECT\n"); - break; - } - pr = acpi_driver_data(device); - if (!pr) { - pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); - break; - } - - /* - * TBD: implement acpi_processor_device_remove if Xen support - * CPU hotremove in the future. 
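The deleted xen_apic_id() above still illustrates a pattern worth keeping in mind: an ACPI-returned buffer is length-checked before being interpreted as a structure, and its type/flag fields are validated before the payload is trusted. A standalone sketch of the same pattern (the struct layout here is illustrative, not the real MADT entry):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct lapic_entry {
        uint8_t type;
        uint8_t length;
        uint8_t acpi_id;
        uint8_t apic_id;
        uint32_t flags;
};

#define LAPIC_TYPE     0
#define LAPIC_ENABLED  (1u << 0)

static int parse_apic_id(const uint8_t *buf, size_t len)
{
        struct lapic_entry e;

        if (len < sizeof(e))          /* too short: reject, don't cast */
                return -1;
        memcpy(&e, buf, sizeof(e));   /* also avoids unaligned access */
        if (e.type != LAPIC_TYPE || !(e.flags & LAPIC_ENABLED))
                return -1;
        return e.apic_id;
}

int main(void)
{
        struct lapic_entry e = { .type = LAPIC_TYPE, .length = 8,
                                 .acpi_id = 1, .apic_id = 42,
                                 .flags = LAPIC_ENABLED };
        uint8_t blob[sizeof(e)];

        memcpy(blob, &e, sizeof(e));
        printf("apic_id = %d\n", parse_apic_id(blob, sizeof(blob)));
        return 0;
}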
- */ - acpi_processor_device_remove(device); - break; - - default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); - - /* non-hotplug event; possibly handled by other handler */ - goto out; - } - - (void) acpi_evaluate_ost(handle, event, ost_code, NULL); - -out: - acpi_scan_lock_release(); -} - -static acpi_status is_processor_device(acpi_handle handle) -{ - struct acpi_device_info *info; - char *hid; - acpi_status status; - - status = acpi_get_object_info(handle, &info); - if (ACPI_FAILURE(status)) - return status; - - if (info->type == ACPI_TYPE_PROCESSOR) { - kfree(info); - return AE_OK; /* found a processor object */ - } - - if (!(info->valid & ACPI_VALID_HID)) { - kfree(info); - return AE_ERROR; - } - - hid = info->hardware_id.string; - if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { - kfree(info); - return AE_ERROR; - } - - kfree(info); - return AE_OK; /* found a processor device object */ -} - -static acpi_status -processor_walk_namespace_cb(acpi_handle handle, - u32 lvl, void *context, void **rv) -{ - acpi_status status; - int *action = context; - - status = is_processor_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* not a processor; continue to walk */ - - switch (*action) { - case INSTALL_NOTIFY_HANDLER: - acpi_install_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_processor_hotplug_notify, - NULL); - break; - case UNINSTALL_NOTIFY_HANDLER: - acpi_remove_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_processor_hotplug_notify); - break; - default: - break; - } - - /* found a processor; skip walking underneath */ - return AE_CTRL_DEPTH; -} - -static -void acpi_processor_install_hotplug_notify(void) -{ - int action = INSTALL_NOTIFY_HANDLER; - acpi_walk_namespace(ACPI_TYPE_ANY, - ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - processor_walk_namespace_cb, NULL, &action, NULL); -} - -static -void acpi_processor_uninstall_hotplug_notify(void) -{ - int action = UNINSTALL_NOTIFY_HANDLER; - acpi_walk_namespace(ACPI_TYPE_ANY, - ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - processor_walk_namespace_cb, NULL, &action, NULL); -} - -static const struct acpi_device_id processor_device_ids[] = { - {ACPI_PROCESSOR_OBJECT_HID, 0}, - {ACPI_PROCESSOR_DEVICE_HID, 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, processor_device_ids); - -static struct acpi_driver xen_acpi_processor_driver = { - .name = "processor", - .class = ACPI_PROCESSOR_CLASS, - .ids = processor_device_ids, - .ops = { - .add = xen_acpi_processor_add, - .remove = xen_acpi_processor_remove, - }, -}; - -static int __init xen_acpi_processor_init(void) -{ - int result = 0; - - if (!xen_initial_domain()) - return -ENODEV; - - /* unregister the stub which only used to reserve driver space */ - xen_stub_processor_exit(); - - result = acpi_bus_register_driver(&xen_acpi_processor_driver); - if (result < 0) { - xen_stub_processor_init(); - return result; - } - - acpi_processor_install_hotplug_notify(); - return 0; -} - -static void __exit xen_acpi_processor_exit(void) -{ - if (!xen_initial_domain()) - return; - - acpi_processor_uninstall_hotplug_notify(); - - acpi_bus_unregister_driver(&xen_acpi_processor_driver); - - /* - * stub reserve space again to prevent any chance of native - * driver loading. 
- */ - xen_stub_processor_init(); - return; -} - -module_init(xen_acpi_processor_init); -module_exit(xen_acpi_processor_exit); -ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); -MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); -MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c deleted file mode 100644 index 4fc886cd5586..000000000000 --- a/drivers/xen/xen-acpi-memhotplug.c +++ /dev/null @@ -1,485 +0,0 @@ -/* - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/acpi.h> -#include <xen/acpi.h> -#include <xen/interface/platform.h> -#include <asm/xen/hypercall.h> - -#define PREFIX "ACPI:xen_memory_hotplug:" - -struct acpi_memory_info { - struct list_head list; - u64 start_addr; /* Memory Range start physical addr */ - u64 length; /* Memory Range length */ - unsigned short caching; /* memory cache attribute */ - unsigned short write_protect; /* memory read/write attribute */ - /* copied from buffer getting from _CRS */ - unsigned int enabled:1; -}; - -struct acpi_memory_device { - struct acpi_device *device; - struct list_head res_list; -}; - -static bool acpi_hotmem_initialized __read_mostly; - -static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) -{ - int rc; - struct xen_platform_op op; - - op.cmd = XENPF_mem_hotadd; - op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; - op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; - op.u.mem_add.pxm = pxm; - - rc = HYPERVISOR_dom0_op(&op); - if (rc) - pr_err(PREFIX "Xen Hotplug Memory Add failed on " - "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", - (unsigned long)info->start_addr, - (unsigned long)(info->start_addr + info->length), - pxm, rc); - - return rc; -} - -static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) -{ - int pxm, result; - int num_enabled = 0; - struct acpi_memory_info *info; - - if (!mem_device) - return -EINVAL; - - pxm = xen_acpi_get_pxm(mem_device->device->handle); - if (pxm < 0) - return pxm; - - list_for_each_entry(info, &mem_device->res_list, list) { - if (info->enabled) { /* just sanity check...*/ - num_enabled++; - continue; - } - - if (!info->length) - continue; - - result = xen_hotadd_memory(pxm, info); - if (result) - continue; - info->enabled = 1; - num_enabled++; - } - - if (!num_enabled) - return -ENODEV; - - return 0; -} - -static acpi_status -acpi_memory_get_resource(struct acpi_resource *resource, void *context) -{ - struct acpi_memory_device *mem_device = context; - struct acpi_resource_address64 address64; - struct acpi_memory_info *info, *new; - acpi_status status; - - status = acpi_resource_to_address64(resource, &address64); - if (ACPI_FAILURE(status) || - 
(address64.resource_type != ACPI_MEMORY_RANGE)) - return AE_OK; - - list_for_each_entry(info, &mem_device->res_list, list) { - if ((info->caching == address64.info.mem.caching) && - (info->write_protect == address64.info.mem.write_protect) && - (info->start_addr + info->length == address64.address.minimum)) { - info->length += address64.address.address_length; - return AE_OK; - } - } - - new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); - if (!new) - return AE_ERROR; - - INIT_LIST_HEAD(&new->list); - new->caching = address64.info.mem.caching; - new->write_protect = address64.info.mem.write_protect; - new->start_addr = address64.address.minimum; - new->length = address64.address.address_length; - list_add_tail(&new->list, &mem_device->res_list); - - return AE_OK; -} - -static int -acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) -{ - acpi_status status; - struct acpi_memory_info *info, *n; - - if (!list_empty(&mem_device->res_list)) - return 0; - - status = acpi_walk_resources(mem_device->device->handle, - METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); - - if (ACPI_FAILURE(status)) { - list_for_each_entry_safe(info, n, &mem_device->res_list, list) - kfree(info); - INIT_LIST_HEAD(&mem_device->res_list); - return -EINVAL; - } - - return 0; -} - -static int acpi_memory_get_device(acpi_handle handle, - struct acpi_memory_device **mem_device) -{ - struct acpi_device *device = NULL; - int result = 0; - - acpi_scan_lock_acquire(); - - acpi_bus_get_device(handle, &device); - if (acpi_device_enumerated(device)) - goto end; - - /* - * Now add the notified device. This creates the acpi_device - * and invokes .add function - */ - result = acpi_bus_scan(handle); - if (result) { - pr_warn(PREFIX "ACPI namespace scan failed\n"); - result = -EINVAL; - goto out; - } - device = NULL; - acpi_bus_get_device(handle, &device); - if (!acpi_device_enumerated(device)) { - pr_warn(PREFIX "Missing device object\n"); - result = -EINVAL; - goto out; - } - -end: - *mem_device = acpi_driver_data(device); - if (!(*mem_device)) { - pr_err(PREFIX "driver data not found\n"); - result = -ENODEV; - goto out; - } - -out: - acpi_scan_lock_release(); - return result; -} - -static int acpi_memory_check_device(struct acpi_memory_device *mem_device) -{ - unsigned long long current_status; - - /* Get device present/absent information from the _STA */ - if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, - "_STA", NULL, ¤t_status))) - return -ENODEV; - /* - * Check for device status. Device should be - * present/enabled/functioning. 
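The _CRS walk above coalesces a new range into an existing list entry when the caching and write-protect attributes match and the new range starts exactly where the existing one ends. A compact sketch of that merge test:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mem_info {
        uint64_t start_addr;
        uint64_t length;
        unsigned short caching;
        unsigned short write_protect;
};

static bool try_merge(struct mem_info *info, const struct mem_info *new)
{
        if (info->caching != new->caching ||
            info->write_protect != new->write_protect ||
            info->start_addr + info->length != new->start_addr)
                return false;
        info->length += new->length;   /* extend in place, no new entry */
        return true;
}

int main(void)
{
        struct mem_info a = { 0x100000000ULL, 0x40000000ULL, 0, 0 };
        struct mem_info b = { 0x140000000ULL, 0x40000000ULL, 0, 0 };

        if (try_merge(&a, &b))
                printf("merged: start=%#llx len=%#llx\n",
                       (unsigned long long)a.start_addr,
                       (unsigned long long)a.length);
        return 0;
}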
- */ - if (!((current_status & ACPI_STA_DEVICE_PRESENT) - && (current_status & ACPI_STA_DEVICE_ENABLED) - && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) - return -ENODEV; - - return 0; -} - -static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) -{ - pr_debug(PREFIX "Xen does not support memory hotremove\n"); - - return -ENOSYS; -} - -static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) -{ - struct acpi_memory_device *mem_device; - struct acpi_device *device; - u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ - - switch (event) { - case ACPI_NOTIFY_BUS_CHECK: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived BUS CHECK notification for device\n")); - /* Fall Through */ - case ACPI_NOTIFY_DEVICE_CHECK: - if (event == ACPI_NOTIFY_DEVICE_CHECK) - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived DEVICE CHECK notification for device\n")); - - if (acpi_memory_get_device(handle, &mem_device)) { - pr_err(PREFIX "Cannot find driver data\n"); - break; - } - - ost_code = ACPI_OST_SC_SUCCESS; - break; - - case ACPI_NOTIFY_EJECT_REQUEST: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "\nReceived EJECT REQUEST notification for device\n")); - - acpi_scan_lock_acquire(); - if (acpi_bus_get_device(handle, &device)) { - acpi_scan_lock_release(); - pr_err(PREFIX "Device doesn't exist\n"); - break; - } - mem_device = acpi_driver_data(device); - if (!mem_device) { - acpi_scan_lock_release(); - pr_err(PREFIX "Driver Data is NULL\n"); - break; - } - - /* - * TBD: implement acpi_memory_disable_device and invoke - * acpi_bus_remove if Xen support hotremove in the future - */ - acpi_memory_disable_device(mem_device); - acpi_scan_lock_release(); - break; - - default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); - /* non-hotplug event; possibly handled by other handler */ - return; - } - - (void) acpi_evaluate_ost(handle, event, ost_code, NULL); - return; -} - -static int xen_acpi_memory_device_add(struct acpi_device *device) -{ - int result; - struct acpi_memory_device *mem_device = NULL; - - - if (!device) - return -EINVAL; - - mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); - if (!mem_device) - return -ENOMEM; - - INIT_LIST_HEAD(&mem_device->res_list); - mem_device->device = device; - sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); - sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); - device->driver_data = mem_device; - - /* Get the range from the _CRS */ - result = acpi_memory_get_device_resources(mem_device); - if (result) { - kfree(mem_device); - return result; - } - - /* - * For booting existed memory devices, early boot code has recognized - * memory area by EFI/E820. If DSDT shows these memory devices on boot, - * hotplug is not necessary for them. - * For hot-added memory devices during runtime, it need hypercall to - * Xen hypervisor to add memory. 
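The _STA check earlier in this hunk requires a memory device to be present, enabled, and functioning all at once before it is hot-added. The combined bit test, sketched standalone (bit values per the ACPI _STA definition):

#include <stdbool.h>
#include <stdio.h>

#define ACPI_STA_DEVICE_PRESENT      (1u << 0)
#define ACPI_STA_DEVICE_ENABLED      (1u << 1)
#define ACPI_STA_DEVICE_FUNCTIONING  (1u << 3)

static bool memory_device_usable(unsigned long long sta)
{
        const unsigned long long need = ACPI_STA_DEVICE_PRESENT |
                                        ACPI_STA_DEVICE_ENABLED |
                                        ACPI_STA_DEVICE_FUNCTIONING;

        return (sta & need) == need;   /* all three bits required */
}

int main(void)
{
        printf("sta=0x0b usable: %d\n", memory_device_usable(0x0b));
        printf("sta=0x03 usable: %d\n", memory_device_usable(0x03));
        return 0;
}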
- */ - if (!acpi_hotmem_initialized) - return 0; - - if (!acpi_memory_check_device(mem_device)) - result = xen_acpi_memory_enable_device(mem_device); - - return result; -} - -static int xen_acpi_memory_device_remove(struct acpi_device *device) -{ - struct acpi_memory_device *mem_device = NULL; - - if (!device || !acpi_driver_data(device)) - return -EINVAL; - - mem_device = acpi_driver_data(device); - kfree(mem_device); - - return 0; -} - -/* - * Helper function to check for memory device - */ -static acpi_status is_memory_device(acpi_handle handle) -{ - char *hardware_id; - acpi_status status; - struct acpi_device_info *info; - - status = acpi_get_object_info(handle, &info); - if (ACPI_FAILURE(status)) - return status; - - if (!(info->valid & ACPI_VALID_HID)) { - kfree(info); - return AE_ERROR; - } - - hardware_id = info->hardware_id.string; - if ((hardware_id == NULL) || - (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) - status = AE_ERROR; - - kfree(info); - return status; -} - -static acpi_status -acpi_memory_register_notify_handler(acpi_handle handle, - u32 level, void *ctxt, void **retv) -{ - acpi_status status; - - status = is_memory_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* continue */ - - status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, - acpi_memory_device_notify, NULL); - /* continue */ - return AE_OK; -} - -static acpi_status -acpi_memory_deregister_notify_handler(acpi_handle handle, - u32 level, void *ctxt, void **retv) -{ - acpi_status status; - - status = is_memory_device(handle); - if (ACPI_FAILURE(status)) - return AE_OK; /* continue */ - - status = acpi_remove_notify_handler(handle, - ACPI_SYSTEM_NOTIFY, - acpi_memory_device_notify); - - return AE_OK; /* continue */ -} - -static const struct acpi_device_id memory_device_ids[] = { - {ACPI_MEMORY_DEVICE_HID, 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, memory_device_ids); - -static struct acpi_driver xen_acpi_memory_device_driver = { - .name = "acpi_memhotplug", - .class = ACPI_MEMORY_DEVICE_CLASS, - .ids = memory_device_ids, - .ops = { - .add = xen_acpi_memory_device_add, - .remove = xen_acpi_memory_device_remove, - }, -}; - -static int __init xen_acpi_memory_device_init(void) -{ - int result; - acpi_status status; - - if (!xen_initial_domain()) - return -ENODEV; - - /* unregister the stub which only used to reserve driver space */ - xen_stub_memory_device_exit(); - - result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); - if (result < 0) { - xen_stub_memory_device_init(); - return -ENODEV; - } - - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - acpi_memory_register_notify_handler, - NULL, NULL, NULL); - - if (ACPI_FAILURE(status)) { - pr_warn(PREFIX "walk_namespace failed\n"); - acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); - xen_stub_memory_device_init(); - return -ENODEV; - } - - acpi_hotmem_initialized = true; - return 0; -} - -static void __exit xen_acpi_memory_device_exit(void) -{ - acpi_status status; - - if (!xen_initial_domain()) - return; - - status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, - acpi_memory_deregister_notify_handler, - NULL, NULL, NULL); - if (ACPI_FAILURE(status)) - pr_warn(PREFIX "walk_namespace failed\n"); - - acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); - - /* - * stub reserve space again to prevent any chance of native - * driver loading. 
- */ - xen_stub_memory_device_init(); - return; -} - -module_init(xen_acpi_memory_device_init); -module_exit(xen_acpi_memory_device_exit); -ACPI_MODULE_NAME("xen-acpi-memhotplug"); -MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); -MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index e25ab76b9c99..ede69a5278d3 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -1,17 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * xen-acpi-pad.c - Xen pad interface * * Copyright (c) 2012, Intel Corporation. * Author: Liu, Jinsong <jinsong.liu@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -130,7 +122,7 @@ static int acpi_pad_add(struct acpi_device *device) return 0; } -static int acpi_pad_remove(struct acpi_device *device) +static void acpi_pad_remove(struct acpi_device *device) { mutex_lock(&xen_cpu_lock); xen_acpi_pad_idle_cpus(0); @@ -138,7 +130,6 @@ static int acpi_pad_remove(struct acpi_device *device) acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_pad_notify); - return 0; } static const struct acpi_device_id pad_device_ids[] = { diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index fbb9137c7d02..f2e8eaf684ba 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -1,20 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2012 by Oracle Inc * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> * - * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * This code borrows ideas from + * https://lore.kernel.org/lkml/1322673664-14642-6-git-send-email-konrad.wilk@oracle.com * so many thanks go to Kevin Tian <kevin.tian@intel.com> * and Yu Ke <ke.yu@intel.com>. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -410,21 +402,21 @@ static int check_acpi_ids(struct acpi_processor *pr_backup) /* All online CPUs have been processed at this stage. Now verify * whether in fact "online CPUs" == physical CPUs. 
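The hunks just below replace the open-coded kcalloc(BITS_TO_LONGS(n), sizeof(unsigned long), ...) idiom with bitmap_zalloc()/bitmap_free(). A userspace sketch of the sizing arithmetic the helper hides (these helper definitions are local stand-ins, not the kernel API):

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG     (CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n)  (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long *bitmap_zalloc(unsigned int nbits)
{
        /* zeroed array of just enough longs to hold nbits bits */
        return calloc(BITS_TO_LONGS(nbits), sizeof(unsigned long));
}

static void set_bit(unsigned int nr, unsigned long *map)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

int main(void)
{
        unsigned int nr_acpi_bits = 72;   /* example: max ACPI id + 1 */
        unsigned long *acpi_id_present = bitmap_zalloc(nr_acpi_bits);

        if (!acpi_id_present)
                return 1;
        set_bit(71, acpi_id_present);
        printf("longs allocated: %zu\n", (size_t)BITS_TO_LONGS(nr_acpi_bits));
        free(acpi_id_present);
        return 0;
}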
*/ - acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_id_present = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_id_present) return -ENOMEM; - acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_id_cst_present = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_id_cst_present) { - kfree(acpi_id_present); + bitmap_free(acpi_id_present); return -ENOMEM; } acpi_psd = kcalloc(nr_acpi_bits, sizeof(struct acpi_psd_package), GFP_KERNEL); if (!acpi_psd) { - kfree(acpi_id_present); - kfree(acpi_id_cst_present); + bitmap_free(acpi_id_present); + bitmap_free(acpi_id_cst_present); return -ENOMEM; } @@ -458,7 +450,7 @@ static struct acpi_processor_performance __percpu *acpi_perf_data; static void free_acpi_perf_data(void) { - unsigned int i; + int i; /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ for_each_possible_cpu(i) @@ -470,7 +462,7 @@ static void free_acpi_perf_data(void) static int xen_upload_processor_pm_data(void) { struct acpi_processor *pr_backup = NULL; - unsigned int i; + int i; int rc = 0; pr_info("Uploading Xen processor PM info\n"); @@ -481,11 +473,8 @@ static int xen_upload_processor_pm_data(void) if (!_pr) continue; - if (!pr_backup) { - pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (pr_backup) - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); - } + if (!pr_backup) + pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL); (void)upload_pm_data(_pr); } @@ -506,7 +495,7 @@ static void xen_acpi_processor_resume_worker(struct work_struct *dummy) pr_info("ACPI data upload failed, error = %d\n", rc); } -static void xen_acpi_processor_resume(void) +static void xen_acpi_processor_resume(void *data) { static DECLARE_WORK(wq, xen_acpi_processor_resume_worker); @@ -520,27 +509,31 @@ static void xen_acpi_processor_resume(void) schedule_work(&wq); } -static struct syscore_ops xap_syscore_ops = { +static const struct syscore_ops xap_syscore_ops = { .resume = xen_acpi_processor_resume, }; +static struct syscore xap_syscore = { + .ops = &xap_syscore_ops, +}; + static int __init xen_acpi_processor_init(void) { - unsigned int i; + int i; int rc; if (!xen_initial_domain()) return -ENODEV; nr_acpi_bits = get_max_acpi_id() + 1; - acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + acpi_ids_done = bitmap_zalloc(nr_acpi_bits, GFP_KERNEL); if (!acpi_ids_done) return -ENOMEM; acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { pr_debug("Memory allocation error for acpi_perf_data\n"); - kfree(acpi_ids_done); + bitmap_free(acpi_ids_done); return -ENOMEM; } for_each_possible_cpu(i) { @@ -574,7 +567,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + register_syscore(&xap_syscore); return 0; err_unregister: @@ -584,17 +577,17 @@ err_unregister: err_out: /* Freeing a NULL pointer is OK: alloc_percpu zeroes. 
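The pr_backup hunk above likewise collapses a kzalloc() followed by memcpy() into a single kmemdup() call, with the same NULL-on-failure behaviour. A userspace analogue of the cleanup:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* one call replaces allocate-then-copy, preserving NULL on failure */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

struct acpi_processor { int acpi_id; int id; };

int main(void)
{
        struct acpi_processor pr = { .acpi_id = 3, .id = 1 };
        struct acpi_processor *pr_backup = memdup(&pr, sizeof(pr));

        if (!pr_backup)
                return 1;
        printf("backup acpi_id=%d\n", pr_backup->acpi_id);
        free(pr_backup);
        return 0;
}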
*/ free_acpi_perf_data(); - kfree(acpi_ids_done); + bitmap_free(acpi_ids_done); return rc; } static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); - kfree(acpi_ids_done); - kfree(acpi_id_present); - kfree(acpi_id_cst_present); + unregister_syscore(&xap_syscore); + bitmap_free(acpi_ids_done); + bitmap_free(acpi_id_present); + bitmap_free(acpi_id_cst_present); kfree(acpi_psd); for_each_possible_cpu(i) acpi_processor_unregister_performance(i); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 2acbfe104e46..b293d7652f15 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -37,6 +37,7 @@ #include <linux/mm_types.h> #include <linux/init.h> #include <linux/capability.h> +#include <linux/memory_hotplug.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -50,6 +51,10 @@ #define BALLOON_CLASS_NAME "xen_memory" +#ifdef CONFIG_MEMORY_HOTPLUG +u64 xen_saved_max_mem_size = 0; +#endif + static struct device balloon_dev; static int register_balloon(struct device *dev); @@ -63,6 +68,12 @@ static void watch_target(struct xenbus_watch *watch, static bool watch_fired; static long target_diff; +#ifdef CONFIG_MEMORY_HOTPLUG + /* The balloon driver will take care of adding memory now. */ + if (xen_saved_max_mem_size) + max_mem_size = xen_saved_max_mem_size; +#endif + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); if (err != 1) { /* This is ok (for domain0 at least) - so just return */ @@ -83,7 +94,7 @@ static void watch_target(struct xenbus_watch *watch, "%llu", &static_max) == 1)) static_max >>= PAGE_SHIFT - 10; else - static_max = new_target; + static_max = balloon_stats.current_pages; target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0 : static_max - balloon_stats.target_pages; @@ -118,20 +129,18 @@ void xen_balloon_init(void) { register_balloon(&balloon_dev); - register_xen_selfballooning(&balloon_dev); - register_xenstore_notifier(&xenstore_notifier); } EXPORT_SYMBOL_GPL(xen_balloon_init); #define BALLOON_SHOW(name, format, args...) 
\ - static ssize_t show_##name(struct device *dev, \ + static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, format, ##args); \ } \ - static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR_RO(name) BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); @@ -143,16 +152,15 @@ static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); static DEVICE_BOOL_ATTR(scrub_pages, 0644, xen_scrub_pages); -static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, +static ssize_t target_kb_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); } -static ssize_t store_target_kb(struct device *dev, +static ssize_t target_kb_store(struct device *dev, struct device_attribute *attr, - const char *buf, - size_t count) + const char *buf, size_t count) { char *endchar; unsigned long long target_bytes; @@ -167,22 +175,19 @@ static ssize_t store_target_kb(struct device *dev, return count; } -static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); +static DEVICE_ATTR_RW(target_kb); - -static ssize_t show_target(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t target_show(struct device *dev, struct device_attribute *attr, + char *buf) { return sprintf(buf, "%llu\n", (unsigned long long)balloon_stats.target_pages << PAGE_SHIFT); } -static ssize_t store_target(struct device *dev, +static ssize_t target_store(struct device *dev, struct device_attribute *attr, - const char *buf, - size_t count) + const char *buf, size_t count) { char *endchar; unsigned long long target_bytes; @@ -197,9 +202,7 @@ static ssize_t store_target(struct device *dev, return count; } -static DEVICE_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); - +static DEVICE_ATTR_RW(target); static struct attribute *balloon_attrs[] = { &dev_attr_target_kb.attr, @@ -234,7 +237,7 @@ static const struct attribute_group *balloon_groups[] = { NULL }; -static struct bus_type balloon_subsys = { +static const struct bus_type balloon_subsys = { .name = BALLOON_CLASS_NAME, .dev_name = BALLOON_CLASS_NAME, }; diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index 48a658dc7ccf..223870a0111b 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -21,16 +21,7 @@ #include <xen/xen-front-pgdir-shbuf.h> -#ifndef GRANT_INVALID_REF /* - * FIXME: usage of grant reference 0 as invalid grant reference: - * grant reference 0 is valid, but never exposed to a PV driver, - * because of the fact it is already in use/reserved by the PV console. - */ -#define GRANT_INVALID_REF 0 -#endif - -/** * This structure represents the structure of a shared page * that contains grant references to the pages of the shared * buffer. This structure is common to many Xen para-virtualized @@ -38,10 +29,11 @@ */ struct xen_page_directory { grant_ref_t gref_dir_next_page; - grant_ref_t gref[1]; /* Variable length */ +#define XEN_GREF_LIST_END 0 + grant_ref_t gref[]; /* Variable length */ }; -/** +/* * Shared buffer ops which are differently implemented * depending on the allocation mode, e.g. if the buffer * is allocated by the corresponding backend or frontend. 
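Note the gref[1] to gref[] change in struct xen_page_directory above: the C99 flexible array member makes the variable-length intent explicit, and sizeof the struct no longer includes a phantom first element, while the offsetof()-based capacity computation used in this file stays the same. A standalone check of that computation (XEN_PAGE_SIZE assumed 4 KiB here):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t grant_ref_t;
#define XEN_PAGE_SIZE 4096

struct xen_page_directory {
        grant_ref_t gref_dir_next_page;
        grant_ref_t gref[];            /* C99 flexible array member */
};

#define XEN_NUM_GREFS_PER_PAGE \
        ((XEN_PAGE_SIZE - offsetof(struct xen_page_directory, gref)) / \
         sizeof(grant_ref_t))

int main(void)
{
        printf("grant refs per 4 KiB directory page: %zu\n",
               (size_t)XEN_NUM_GREFS_PER_PAGE);   /* 1023 */
        return 0;
}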
@@ -69,7 +61,7 @@ struct xen_front_pgdir_shbuf_ops { int (*unmap)(struct xen_front_pgdir_shbuf *buf); }; -/** +/* * Get granted reference to the very first page of the * page directory. Usually this is passed to the backend, * so it can find/fill the grant references to the buffer's @@ -83,13 +75,13 @@ grant_ref_t xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf) { if (!buf->grefs) - return GRANT_INVALID_REF; + return INVALID_GRANT_REF; return buf->grefs[0]; } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); -/** +/* * Map granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -97,7 +89,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); * shared by the frontend itself) or map the provided granted * references onto the backing storage (buf->pages). * - * \param buf shared buffer which grants to be maped. + * \param buf shared buffer which grants to be mapped. * \return zero on success or a negative number on failure. */ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) @@ -110,7 +102,7 @@ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); -/** +/* * Unmap granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -118,7 +110,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); * shared by the frontend itself) or unmap the provided granted * references. * - * \param buf shared buffer which grants to be unmaped. + * \param buf shared buffer which grants to be unmapped. * \return zero on success or a negative number on failure. */ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) @@ -131,7 +123,7 @@ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap); -/** +/* * Free all the resources of the shared buffer. * * \param buf shared buffer which resources to be freed. @@ -142,9 +134,8 @@ void xen_front_pgdir_shbuf_free(struct xen_front_pgdir_shbuf *buf) int i; for (i = 0; i < buf->num_grefs; i++) - if (buf->grefs[i] != GRANT_INVALID_REF) - gnttab_end_foreign_access(buf->grefs[i], - 0, 0UL); + if (buf->grefs[i] != INVALID_GRANT_REF) + gnttab_end_foreign_access(buf->grefs[i], NULL); } kfree(buf->grefs); kfree(buf->directory); @@ -159,7 +150,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free); offsetof(struct xen_page_directory, \ gref)) / sizeof(grant_ref_t)) -/** +/* * Get the number of pages the page directory consumes itself. * * \param buf shared buffer. @@ -169,7 +160,7 @@ static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf) return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when backend allocates the buffer. * @@ -181,7 +172,7 @@ static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) buf->num_grefs = get_num_pages_dir(buf); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when frontend allocates the buffer. * @@ -199,7 +190,7 @@ static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) #define xen_page_to_vaddr(page) \ ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page))) -/** +/* * Unmap the buffer previously mapped with grant references * provided by the backend. * @@ -247,7 +238,7 @@ static int backend_unmap(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Map the buffer with grant references provided by the backend. 
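The guest_fill_page_dir() hunk further below chains directory pages through gref_dir_next_page and terminates the list with the new XEN_GREF_LIST_END marker in place of the removed GRANT_INVALID_REF. A toy model of that split-and-chain loop (PER_PAGE shrunk so the split is visible):

#include <stdio.h>

#define XEN_GREF_LIST_END 0
#define PER_PAGE 4                 /* real value: 1023 per 4 KiB page */

struct page_dir {
        unsigned int next;         /* gref of next directory page, or end */
        unsigned int gref[PER_PAGE];
};

int main(void)
{
        struct page_dir dirs[3];
        unsigned int grefs_left = 10, gref = 100;

        for (int i = 0; grefs_left; i++) {
                unsigned int to_copy = grefs_left < PER_PAGE ?
                                       grefs_left : PER_PAGE;

                for (unsigned int j = 0; j < to_copy; j++)
                        dirs[i].gref[j] = gref++;
                grefs_left -= to_copy;
                /* last page must say there are no more pages */
                dirs[i].next = grefs_left ? 1000 + i + 1 : XEN_GREF_LIST_END;
                printf("dir %d: %u grefs, next=%u\n", i, to_copy, dirs[i].next);
        }
        return 0;
}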
* * \param buf shared buffer. @@ -305,11 +296,18 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) /* Save handles even if error, so we can unmap. */ for (cur_page = 0; cur_page < buf->num_pages; cur_page++) { - buf->backend_map_handles[cur_page] = map_ops[cur_page].handle; - if (unlikely(map_ops[cur_page].status != GNTST_okay)) + if (likely(map_ops[cur_page].status == GNTST_okay)) { + buf->backend_map_handles[cur_page] = + map_ops[cur_page].handle; + } else { + buf->backend_map_handles[cur_page] = + INVALID_GRANT_HANDLE; + if (!ret) + ret = -ENXIO; dev_err(&buf->xb_dev->dev, "Failed to map page %d: %d\n", cur_page, map_ops[cur_page].status); + } } if (ret) { @@ -322,7 +320,7 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Fill page directory with grant references to the pages of the * page directory itself. * @@ -349,10 +347,10 @@ static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf) } /* Last page must say there is no more pages. */ page_dir = (struct xen_page_directory *)ptr; - page_dir->gref_dir_next_page = GRANT_INVALID_REF; + page_dir->gref_dir_next_page = XEN_GREF_LIST_END; } -/** +/* * Fill page directory with grant references to the pages of the * page directory and the buffer we share with the backend. * @@ -378,7 +376,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) if (grefs_left <= XEN_NUM_GREFS_PER_PAGE) { to_copy = grefs_left; - page_dir->gref_dir_next_page = GRANT_INVALID_REF; + page_dir->gref_dir_next_page = XEN_GREF_LIST_END; } else { to_copy = XEN_NUM_GREFS_PER_PAGE; page_dir->gref_dir_next_page = buf->grefs[i + 1]; @@ -391,7 +389,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) } } -/** +/* * Grant references to the frontend's buffer pages. * * These will be shared with the backend, so it can @@ -420,7 +418,7 @@ static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf, return 0; } -/** +/* * Grant all the references needed to share the buffer. * * Grant references to the page directory pages and, if @@ -468,7 +466,7 @@ static int grant_references(struct xen_front_pgdir_shbuf *buf) return 0; } -/** +/* * Allocate all required structures to mange shared buffer. * * \param buf shared buffer. @@ -508,7 +506,7 @@ static const struct xen_front_pgdir_shbuf_ops local_ops = { .grant_refs_for_buffer = guest_grant_refs_for_buffer, }; -/** +/* * Allocate a new instance of a shared buffer. * * \param cfg configuration to be used while allocating a new shared buffer. diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile index e8d981d43235..d63df09de81c 100644 --- a/drivers/xen/xen-pciback/Makefile +++ b/drivers/xen/xen-pciback/Makefile @@ -1,5 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 + +# N.B. The below cannot be expressed with a single line using +# CONFIG_XEN_PCI_STUB as it always remains in "y" state, +# thus preventing the driver to be built as a module. +# Please note, that CONFIG_XEN_PCIDEV_BACKEND and +# CONFIG_XEN_PCIDEV_STUB are mutually exclusive. 
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o +obj-$(CONFIG_XEN_PCIDEV_STUB) += xen-pciback.o xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o xen-pciback-y += conf_space.o conf_space_header.o \ diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 60111719b01f..d47eee6c5143 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -10,6 +10,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define dev_fmt(fmt) DRV_NAME ": " fmt + #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/pci.h> @@ -154,9 +156,7 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, * (as if device didn't respond) */ u32 value = 0, tmp_val; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", - pci_name(dev), size, offset); + dev_dbg(&dev->dev, "read %d bytes at 0x%x\n", size, offset); if (!valid_request(offset, size)) { err = XEN_PCI_ERR_invalid_offset; @@ -195,9 +195,7 @@ int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, } out: - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", - pci_name(dev), size, offset, value); + dev_dbg(&dev->dev, "read %d bytes at 0x%x = %x\n", size, offset, value); *ret_val = value; return xen_pcibios_err_to_errno(err); @@ -212,10 +210,8 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) u32 tmp_val; int field_start, field_end; - if (unlikely(verbose_request)) - printk(KERN_DEBUG - DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", - pci_name(dev), size, offset, value); + dev_dbg(&dev->dev, "write request %d bytes at 0x%x = %x\n", + size, offset, value); if (!valid_request(offset, size)) return XEN_PCI_ERR_invalid_offset; @@ -286,6 +282,50 @@ int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) return xen_pcibios_err_to_errno(err); } +int xen_pcibk_get_interrupt_type(struct pci_dev *dev) +{ + int err; + u16 val; + int ret = 0; + + /* + * Do not trust dev->msi(x)_enabled here, as enabling could be done + * bypassing the pci_*msi* functions, by the qemu. + */ + if (dev->msi_cap) { + err = pci_read_config_word(dev, + dev->msi_cap + PCI_MSI_FLAGS, + &val); + if (err) + return err; + if (val & PCI_MSI_FLAGS_ENABLE) + ret |= INTERRUPT_TYPE_MSI; + } + if (dev->msix_cap) { + err = pci_read_config_word(dev, + dev->msix_cap + PCI_MSIX_FLAGS, + &val); + if (err) + return err; + if (val & PCI_MSIX_FLAGS_ENABLE) + ret |= INTERRUPT_TYPE_MSIX; + } + + /* + * PCIe spec says device cannot use INTx if MSI/MSI-X is enabled, + * so check for INTx only when both are disabled. 
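The new xen_pcibk_get_interrupt_type() reads the enable bits straight from config space rather than trusting dev->msi(x)_enabled, and reports INTx only when MSI and MSI-X are both off, per the PCIe rule quoted above. A userspace model of the same decision (register values passed in as plain integers):

#include <stdio.h>

#define INTERRUPT_TYPE_NONE  0
#define INTERRUPT_TYPE_INTX  (1 << 0)
#define INTERRUPT_TYPE_MSI   (1 << 1)
#define INTERRUPT_TYPE_MSIX  (1 << 2)

#define PCI_MSI_FLAGS_ENABLE       0x0001
#define PCI_MSIX_FLAGS_ENABLE      0x8000
#define PCI_COMMAND_INTX_DISABLE   0x0400

static int interrupt_type(unsigned msi_flags, unsigned msix_flags,
                          unsigned command)
{
        int ret = 0;

        if (msi_flags & PCI_MSI_FLAGS_ENABLE)
                ret |= INTERRUPT_TYPE_MSI;
        if (msix_flags & PCI_MSIX_FLAGS_ENABLE)
                ret |= INTERRUPT_TYPE_MSIX;
        /* INTx counts only when MSI and MSI-X are both disabled */
        if (!ret && !(command & PCI_COMMAND_INTX_DISABLE))
                ret |= INTERRUPT_TYPE_INTX;
        return ret ? ret : INTERRUPT_TYPE_NONE;
}

int main(void)
{
        printf("MSI on:            %d\n", interrupt_type(1, 0, 0));
        printf("all off:           %d\n", interrupt_type(0, 0, 0));
        printf("INTx disabled too: %d\n", interrupt_type(0, 0, 0x400));
        return 0;
}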
+ */ + if (!ret) { + err = pci_read_config_word(dev, PCI_COMMAND, &val); + if (err) + return err; + if (!(val & PCI_COMMAND_INTX_DISABLE)) + ret |= INTERRUPT_TYPE_INTX; + } + + return ret ?: INTERRUPT_TYPE_NONE; +} + void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) { struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h index 22db630717ea..5fe431c79f25 100644 --- a/drivers/xen/xen-pciback/conf_space.h +++ b/drivers/xen/xen-pciback/conf_space.h @@ -65,6 +65,11 @@ struct config_field_entry { void *data; }; +#define INTERRUPT_TYPE_NONE (0) +#define INTERRUPT_TYPE_INTX (1<<0) +#define INTERRUPT_TYPE_MSI (1<<1) +#define INTERRUPT_TYPE_MSIX (1<<2) + extern bool xen_pcibk_permissive; #define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) @@ -126,4 +131,6 @@ int xen_pcibk_config_capability_init(void); int xen_pcibk_config_header_add_fields(struct pci_dev *dev); int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); +int xen_pcibk_get_interrupt_type(struct pci_dev *dev); + #endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c index 73427d8e0116..cf568e899ee2 100644 --- a/drivers/xen/xen-pciback/conf_space_capability.c +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -116,14 +116,13 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, { int err; u16 old_value; - pci_power_t new_state, old_state; + pci_power_t new_state; err = pci_read_config_word(dev, offset, &old_value); if (err) goto out; - old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); - new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); new_value &= PM_OK_BITS; if ((old_value & PM_OK_BITS) != new_value) { @@ -161,7 +160,7 @@ static void *pm_ctrl_init(struct pci_dev *dev, int offset) } out: - return ERR_PTR(err); + return err ? 
ERR_PTR(err) : NULL; } static const struct config_field caplist_pm[] = { @@ -190,6 +189,94 @@ static const struct config_field caplist_pm[] = { {} }; +static struct msi_msix_field_config { + u16 enable_bit; /* bit for enabling MSI/MSI-X */ + u16 allowed_bits; /* bits allowed to be changed */ + unsigned int int_type; /* interrupt type for exclusiveness check */ +} msi_field_config = { + .enable_bit = PCI_MSI_FLAGS_ENABLE, + .allowed_bits = PCI_MSI_FLAGS_ENABLE, + .int_type = INTERRUPT_TYPE_MSI, +}, msix_field_config = { + .enable_bit = PCI_MSIX_FLAGS_ENABLE, + .allowed_bits = PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL, + .int_type = INTERRUPT_TYPE_MSIX, +}; + +static void *msi_field_init(struct pci_dev *dev, int offset) +{ + return &msi_field_config; +} + +static void *msix_field_init(struct pci_dev *dev, int offset) +{ + return &msix_field_config; +} + +static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + const struct msi_msix_field_config *field_config = data; + const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + if (xen_pcibk_permissive || dev_data->permissive) + goto write; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + return err; + + if (new_value == old_value) + return 0; + + if (!dev_data->allow_interrupt_control || + (new_value ^ old_value) & ~field_config->allowed_bits) + return PCIBIOS_SET_FAILED; + + if (new_value & field_config->enable_bit) { + /* + * Don't allow enabling together with other interrupt type, but do + * allow enabling MSI(-X) while INTx is still active to please Linuxes + * MSI(-X) startup sequence. It is safe to do, as according to PCI + * spec, device with enabled MSI(-X) shouldn't use INTx. + */ + int int_type = xen_pcibk_get_interrupt_type(dev); + + if (int_type == INTERRUPT_TYPE_NONE || + int_type == INTERRUPT_TYPE_INTX || + int_type == field_config->int_type) + goto write; + return PCIBIOS_SET_FAILED; + } + +write: + return pci_write_config_word(dev, offset, new_value); +} + +static const struct config_field caplist_msix[] = { + { + .offset = PCI_MSIX_FLAGS, + .size = 2, + .init = msix_field_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = msi_msix_flags_write, + }, + {} +}; + +static const struct config_field caplist_msi[] = { + { + .offset = PCI_MSI_FLAGS, + .size = 2, + .init = msi_field_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = msi_msix_flags_write, + }, + {} +}; + static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { .capability = PCI_CAP_ID_PM, .fields = caplist_pm, @@ -198,11 +285,21 @@ static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = { .capability = PCI_CAP_ID_VPD, .fields = caplist_vpd, }; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msi = { + .capability = PCI_CAP_ID_MSI, + .fields = caplist_msi, +}; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msix = { + .capability = PCI_CAP_ID_MSIX, + .fields = caplist_msix, +}; int xen_pcibk_config_capability_init(void) { register_capability(&xen_pcibk_config_capability_vpd); register_capability(&xen_pcibk_config_capability_pm); + register_capability(&xen_pcibk_config_capability_msi); + register_capability(&xen_pcibk_config_capability_msix); return 0; } diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 10ae24b5a76e..fc0332645966 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ 
b/drivers/xen/xen-pciback/conf_space_header.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/kernel.h> #include <linux/pci.h> @@ -67,56 +68,46 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) dev_data = pci_get_drvdata(dev); if (!pci_is_enabled(dev) && is_enable_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable\n", - pci_name(dev)); + dev_dbg(&dev->dev, "enable\n"); err = pci_enable_device(dev); if (err) return err; if (dev_data) dev_data->enable_intx = 1; } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable\n", - pci_name(dev)); + dev_dbg(&dev->dev, "disable\n"); pci_disable_device(dev); if (dev_data) dev_data->enable_intx = 0; } if (!dev->is_busmaster && is_master_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", - pci_name(dev)); + dev_dbg(&dev->dev, "set bus master\n"); pci_set_master(dev); } else if (dev->is_busmaster && !is_master_cmd(value)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: clear bus master\n", - pci_name(dev)); + dev_dbg(&dev->dev, "clear bus master\n"); pci_clear_master(dev); } if (!(cmd->val & PCI_COMMAND_INVALIDATE) && (value & PCI_COMMAND_INVALIDATE)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG - DRV_NAME ": %s: enable memory-write-invalidate\n", - pci_name(dev)); + dev_dbg(&dev->dev, "enable memory-write-invalidate\n"); err = pci_set_mwi(dev); if (err) { - pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", - pci_name(dev), err); + dev_warn(&dev->dev, "cannot enable memory-write-invalidate (%d)\n", + err); value &= ~PCI_COMMAND_INVALIDATE; } } else if ((cmd->val & PCI_COMMAND_INVALIDATE) && !(value & PCI_COMMAND_INVALIDATE)) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG - DRV_NAME ": %s: disable memory-write-invalidate\n", - pci_name(dev)); + dev_dbg(&dev->dev, "disable memory-write-invalidate\n"); pci_clear_mwi(dev); } + if (dev_data && dev_data->allow_interrupt_control && + ((cmd->val ^ value) & PCI_COMMAND_INTX_DISABLE)) + pci_intx(dev, !(value & PCI_COMMAND_INTX_DISABLE)); + cmd->val = value; if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive)) @@ -138,8 +129,7 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } @@ -175,8 +165,7 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) u32 mask; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } @@ -209,8 +198,7 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - pr_warn(DRV_NAME ": driver data not found for %s\n", - pci_name(dev)); + dev_warn(&dev->dev, "driver data not found\n"); return XEN_PCI_ERR_op_failed; } @@ -233,8 +221,12 @@ static void *bar_init(struct pci_dev *dev, int offset) else { pos = (offset - PCI_BASE_ADDRESS_0) / 4; if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) { - bar->val = res[pos - 1].start >> 32; - bar->len_val = -resource_size(&res[pos - 1]) >> 32; + /* + * Use ">> 16 >> 16" instead of 
direct ">> 32" shift + * to avoid warnings on 32-bit architectures. + */ + bar->val = res[pos - 1].start >> 16 >> 16; + bar->len_val = -resource_size(&res[pos - 1]) >> 16 >> 16; return bar; } } @@ -414,8 +406,8 @@ int xen_pcibk_config_header_add_fields(struct pci_dev *dev) default: err = -EINVAL; - pr_err("%s: Unsupported header type %d!\n", - pci_name(dev), dev->hdr_type); + dev_err(&dev->dev, "Unsupported header type %d!\n", + dev->hdr_type); break; } diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c index ed593d1042a6..7dc281086302 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.c +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -6,6 +6,8 @@ * Author: Chris Bookholt <hap10@epoch.ncsc.mil> */ +#define dev_fmt(fmt) DRV_NAME ": " fmt + #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" @@ -35,8 +37,8 @@ static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) if (match_one_device(&tmp_quirk->devid, dev) != NULL) goto out; tmp_quirk = NULL; - printk(KERN_DEBUG DRV_NAME - ": quirk didn't match any device known\n"); + dev_printk(KERN_DEBUG, &dev->dev, + "quirk didn't match any device known\n"); out: return tmp_quirk; } diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h index d873abe35bf6..fc1557dfef49 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.h +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -21,8 +21,6 @@ struct xen_pcibk_config_quirk { int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field *field); -int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); - int xen_pcibk_config_quirks_init(struct pci_dev *dev); void xen_pcibk_config_field_free(struct config_field *field); diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 097410a7cdb7..045e74847fe6 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/module.h> #include <linux/init.h> @@ -18,7 +19,11 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <xen/events.h> -#include <asm/xen/pci.h> +#include <xen/pci.h> +#include <xen/xen.h> +#ifdef CONFIG_XEN_ACPI +#include <xen/acpi.h> +#endif #include <asm/xen/hypervisor.h> #include <xen/interface/physdev.h> #include "pciback.h" @@ -51,6 +56,9 @@ struct pcistub_device { struct pci_dev *dev; struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +#ifdef CONFIG_XEN_ACPI + int gsi; +#endif }; /* Access to pcistub_devices & seized_devices lists and the initialize_devices @@ -83,10 +91,23 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) kref_init(&psdev->kref); spin_lock_init(&psdev->lock); +#ifdef CONFIG_XEN_ACPI + psdev->gsi = -1; +#endif return psdev; } +static int pcistub_reset_device_state(struct pci_dev *dev) +{ + __pci_reset_function_locked(dev); + + if (!xen_pv_domain()) + return xen_reset_device(dev); + else + return 0; +} + /* Don't call this directly as it's called by pcistub_device_put */ static void pcistub_device_release(struct kref *kref) { @@ -105,7 +126,7 @@ static void pcistub_device_release(struct kref *kref) /* Call the reset function which does not take lock as this * is called from "unbind" which takes a device_lock mutex. 
*/ - __pci_reset_function_locked(dev); + pcistub_reset_device_state(dev); if (dev_data && pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) dev_info(&dev->dev, "Could not reload PCI state\n"); @@ -192,8 +213,6 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *pci_dev = NULL; unsigned long flags; - pcistub_device_get(psdev); - spin_lock_irqsave(&psdev->lock, flags); if (!psdev->pdev) { psdev->pdev = pdev; @@ -201,32 +220,33 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, } spin_unlock_irqrestore(&psdev->lock, flags); - if (!pci_dev) - pcistub_device_put(psdev); + if (pci_dev) + pcistub_device_get(psdev); return pci_dev; } -struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, - int domain, int bus, - int slot, int func) +#ifdef CONFIG_XEN_ACPI +static int pcistub_get_gsi_from_sbdf(unsigned int sbdf) { struct pcistub_device *psdev; - struct pci_dev *found_dev = NULL; - unsigned long flags; + int domain = (sbdf >> 16) & 0xffff; + int bus = PCI_BUS_NUM(sbdf); + int slot = PCI_SLOT(sbdf); + int func = PCI_FUNC(sbdf); - spin_lock_irqsave(&pcistub_devices_lock, flags); + psdev = pcistub_device_find(domain, bus, slot, func); - psdev = pcistub_device_find_locked(domain, bus, slot, func); - if (psdev) - found_dev = pcistub_device_get_pci_dev(pdev, psdev); + if (!psdev) + return -ENODEV; - spin_unlock_irqrestore(&pcistub_devices_lock, flags); - return found_dev; + return psdev->gsi; } +#endif -struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev) +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func) { struct pcistub_device *psdev; struct pci_dev *found_dev = NULL; @@ -234,12 +254,9 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, spin_lock_irqsave(&pcistub_devices_lock, flags); - list_for_each_entry(psdev, &pcistub_devices, dev_list) { - if (psdev->dev == dev) { - found_dev = pcistub_device_get_pci_dev(pdev, psdev); - break; - } - } + psdev = pcistub_device_find_locked(domain, bus, slot, func); + if (psdev) + found_dev = pcistub_device_get_pci_dev(pdev, psdev); spin_unlock_irqrestore(&pcistub_devices_lock, flags); return found_dev; @@ -284,7 +301,7 @@ void pcistub_put_pci_dev(struct pci_dev *dev) * (so it's ready for the next domain) */ device_lock_assert(&dev->dev); - __pci_reset_function_locked(dev); + pcistub_reset_device_state(dev); dev_data = pci_get_drvdata(dev); ret = pci_load_saved_state(dev, dev_data->pci_saved_state); @@ -304,6 +321,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev) xen_pcibk_config_reset_dev(dev); xen_pcibk_config_free_dyn_fields(dev); + dev_data->allow_interrupt_control = 0; + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); @@ -352,11 +371,20 @@ static int pcistub_match(struct pci_dev *dev) return found; } -static int pcistub_init_device(struct pci_dev *dev) +static int pcistub_init_device(struct pcistub_device *psdev) { struct xen_pcibk_dev_data *dev_data; + struct pci_dev *dev; +#ifdef CONFIG_XEN_ACPI + int gsi, trigger, polarity; +#endif int err = 0; + if (!psdev) + return -EINVAL; + + dev = psdev->dev; + dev_dbg(&dev->dev, "initializing...\n"); /* The PCI backend is not intended to be a module (or to work with @@ -418,9 +446,26 @@ static int pcistub_init_device(struct pci_dev *dev) dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); else { dev_dbg(&dev->dev, "resetting (FLR, D3, 
etc) the device\n"); - __pci_reset_function_locked(dev); + err = pcistub_reset_device_state(dev); + if (err) + goto config_release; pci_restore_state(dev); } + +#ifdef CONFIG_XEN_ACPI + if (xen_initial_domain() && xen_pvh_domain()) { + err = xen_acpi_get_gsi_info(dev, &gsi, &trigger, &polarity); + if (err) { + dev_err(&dev->dev, "Fail to get gsi info!\n"); + goto config_release; + } + err = xen_pvh_setup_gsi(gsi, trigger, polarity); + if (err) + goto config_release; + psdev->gsi = gsi; + } +#endif + /* Now disable the device (this also ensures some private device * data is setup before we export) */ @@ -460,7 +505,7 @@ static int __init pcistub_init_devices_late(void) spin_unlock_irqrestore(&pcistub_devices_lock, flags); - err = pcistub_init_device(psdev->dev); + err = pcistub_init_device(psdev); if (err) { dev_err(&psdev->dev->dev, "error %d initializing device\n", err); @@ -530,7 +575,7 @@ static int pcistub_seize(struct pci_dev *dev, spin_unlock_irqrestore(&pcistub_devices_lock, flags); /* don't want irqs disabled when calling pcistub_init_device */ - err = pcistub_init_device(psdev->dev); + err = pcistub_init_device(psdev); spin_lock_irqsave(&pcistub_devices_lock, flags); @@ -624,11 +669,11 @@ static void pcistub_remove(struct pci_dev *dev) if (found_psdev->pdev) { int domid = xen_find_device_domain_owner(dev); - pr_warn("****** removing device %s while still in-use by domain %d! ******\n", + dev_warn(&dev->dev, "****** removing device %s while still in-use by domain %d! ******\n", pci_name(found_psdev->dev), domid); - pr_warn("****** driver domain may still access this device's i/o resources!\n"); - pr_warn("****** shutdown driver domain before binding device\n"); - pr_warn("****** to other drivers or domains\n"); + dev_warn(&dev->dev, "****** driver domain may still access this device's i/o resources!\n"); + dev_warn(&dev->dev, "****** shutdown driver domain before binding device\n"); + dev_warn(&dev->dev, "****** to other drivers or domains\n"); /* N.B. This ends up calling pcistub_put_pci_dev which ends up * doing the FLR. */ @@ -709,14 +754,12 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev, &aer_op->domain, &aer_op->bus, &aer_op->devfn); if (!ret) { - dev_err(&psdev->dev->dev, - DRV_NAME ": failed to get pcifront device\n"); + dev_err(&psdev->dev->dev, "failed to get pcifront device\n"); return PCI_ERS_RESULT_NONE; } wmb(); - dev_dbg(&psdev->dev->dev, - DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n", + dev_dbg(&psdev->dev->dev, "aer_op %x dom %x bus %x devfn %x\n", aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); /*local flag to mark there's aer request, xen_pcibk callback will use * this flag to judge whether we need to check pci-front give aer @@ -733,10 +776,17 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, wmb(); notify_remote_via_irq(pdev->evtchn_irq); + /* Enable IRQ to signal "request done". */ + xen_pcibk_lateeoi(pdev, 0); + ret = wait_event_timeout(xen_pcibk_aer_wait_queue, !(test_bit(_XEN_PCIB_active, (unsigned long *) &sh_info->flags)), 300*HZ); + /* Enable IRQ for pcifront request if not already active. 
*/ + if (!test_bit(_PDEVF_op_active, &pdev->flags)) + xen_pcibk_lateeoi(pdev, 0); + if (!ret) { if (test_bit(_XEN_PCIB_active, (unsigned long *)&sh_info->flags)) { @@ -750,14 +800,7 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, } clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags); - if (test_bit(_XEN_PCIF_active, - (unsigned long *)&sh_info->flags)) { - dev_dbg(&psdev->dev->dev, - "schedule pci_conf service in " DRV_NAME "\n"); - xen_pcibk_test_and_schedule_op(psdev->pdev); - } - - res = (pci_ers_result_t)aer_op->err; + res = (__force pci_ers_result_t)aer_op->err; return res; } @@ -784,13 +827,12 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -802,7 +844,7 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) "guest with no AER driver should have been killed\n"); goto end; } - result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); + result = common_process(psdev, pci_channel_io_normal, XEN_PCI_OP_aer_slotreset, result); if (result == PCI_ERS_RESULT_NONE || result == PCI_ERS_RESULT_DISCONNECT) { @@ -842,13 +884,12 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -860,7 +901,7 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) "guest with no AER driver should have been killed\n"); goto end; } - result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); + result = common_process(psdev, pci_channel_io_normal, XEN_PCI_OP_aer_mmio, result); if (result == PCI_ERS_RESULT_NONE || result == PCI_ERS_RESULT_DISCONNECT) { @@ -900,13 +941,12 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -954,13 +994,12 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) PCI_FUNC(dev->devfn)); if (!psdev || !psdev->pdev) { - dev_err(&dev->dev, - DRV_NAME " device is not found/assigned\n"); + dev_err(&dev->dev, "device is not found/assigned\n"); goto end; } if (!psdev->pdev->sh_info) { - dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + dev_err(&dev->dev, "device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); goto end; @@ -973,7 +1012,7 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) kill_domain_by_device(psdev); goto end; } - common_process(psdev, 1, XEN_PCI_OP_aer_resume, + common_process(psdev, 
pci_channel_io_normal, XEN_PCI_OP_aer_resume, PCI_ERS_RESULT_RECOVERED); end: if (psdev) @@ -1222,7 +1261,7 @@ static ssize_t slots_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) break; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "%04x:%02x:%02x.%d\n", pci_dev_id->domain, pci_dev_id->bus, PCI_SLOT(pci_dev_id->devfn), @@ -1251,7 +1290,7 @@ static ssize_t irq_handlers_show(struct device_driver *drv, char *buf) if (!dev_data) continue; count += - scnprintf(buf + count, PAGE_SIZE - count, + sysfs_emit_at(buf, count, "%s:%s:%sing:%ld\n", pci_name(psdev->dev), dev_data->isr_on ? "on" : "off", @@ -1336,7 +1375,7 @@ static ssize_t quirks_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) goto out; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", quirk->pdev->bus->number, PCI_SLOT(quirk->pdev->devfn), @@ -1352,7 +1391,7 @@ static ssize_t quirks_show(struct device_driver *drv, char *buf) if (count >= PAGE_SIZE) goto out; - count += scnprintf(buf + count, PAGE_SIZE - count, + count += sysfs_emit_at(buf, count, "\t\t%08x:%01x:%08x\n", cfg_entry->base_offset + field->offset, field->size, @@ -1423,7 +1462,7 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf) if (!dev_data || !dev_data->permissive) continue; count += - scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + sysfs_emit_at(buf, count, "%s\n", pci_name(psdev->dev)); } spin_unlock_irqrestore(&pcistub_devices_lock, flags); @@ -1431,6 +1470,65 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf) } static DRIVER_ATTR_RW(permissive); +static ssize_t allow_interrupt_control_store(struct device_driver *drv, + const char *buf, size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + dev_data->allow_interrupt_control = 1; +release: + pcistub_device_put(psdev); +out: + if (!err) + err = count; + return err; +} + +static ssize_t allow_interrupt_control_show(struct device_driver *drv, + char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->allow_interrupt_control) + continue; + count += + sysfs_emit_at(buf, count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR_RW(allow_interrupt_control); + static void pcistub_exit(void) { driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot); @@ -1441,6 +1539,8 @@ static void pcistub_exit(void) driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_permissive); driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_allow_interrupt_control); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_irq_handlers); 
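
[Editor's note on the sysfs hunks above: the patch adds an allow_interrupt_control driver attribute next to the existing permissive one, and converts the show routines from scnprintf() to sysfs_emit_at(). sysfs_emit_at() knows a sysfs buffer is a full page and bounds-checks against PAGE_SIZE internally, so callers only carry the running offset. A minimal sketch of the resulting show idiom follows; struct item and item_list are hypothetical stand-ins for the driver's pcistub_devices list, not names from the patch.]

    #include <linux/device.h>
    #include <linux/list.h>
    #include <linux/sysfs.h>

    struct item {                       /* hypothetical list element */
            struct list_head node;
            const char *name;
    };
    static LIST_HEAD(item_list);        /* hypothetical driver-wide list */

    static ssize_t example_show(struct device_driver *drv, char *buf)
    {
            struct item *it;
            size_t count = 0;

            list_for_each_entry(it, &item_list, node) {
                    if (count >= PAGE_SIZE)
                            break;
                    /* sysfs_emit_at() clamps to PAGE_SIZE on its own. */
                    count += sysfs_emit_at(buf, count, "%s\n", it->name);
            }
            return count;
    }

[Usage: once the attribute exists, a toolstack can opt a device into guest INTx control with e.g. "echo 0000:03:00.0 > /sys/bus/pci/drivers/pciback/allow_interrupt_control"; command_write() then honours PCI_COMMAND_INTX_DISABLE flips from the frontend, and pcistub_put_pci_dev() clears the flag again when the device is handed back.]
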
driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_irq_handler_state); @@ -1530,6 +1630,9 @@ static int __init pcistub_init(void) if (!err) err = driver_create_file(&xen_pcibk_pci_driver.driver, &driver_attr_permissive); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_allow_interrupt_control); if (!err) err = driver_create_file(&xen_pcibk_pci_driver.driver, @@ -1633,11 +1736,19 @@ static int __init xen_pcibk_init(void) bus_register_notifier(&pci_bus_type, &pci_stub_nb); #endif +#ifdef CONFIG_XEN_ACPI + xen_acpi_register_get_gsi_func(pcistub_get_gsi_from_sbdf); +#endif + return err; } static void __exit xen_pcibk_cleanup(void) { +#ifdef CONFIG_XEN_ACPI + xen_acpi_register_get_gsi_func(NULL); +#endif + #ifdef CONFIG_PCI_IOV bus_unregister_notifier(&pci_bus_type, &pci_stub_nb); #endif @@ -1648,5 +1759,6 @@ static void __exit xen_pcibk_cleanup(void) module_init(xen_pcibk_init); module_exit(xen_pcibk_cleanup); +MODULE_DESCRIPTION("Xen PCI-device stub driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index 263c059bff90..b786c1f74f85 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/atomic.h> +#include <xen/events.h> #include <xen/interface/io/pciif.h> #define DRV_NAME "xen-pciback" @@ -27,6 +28,8 @@ struct pci_dev_entry { #define PDEVF_op_active (1<<(_PDEVF_op_active)) #define _PCIB_op_pending (1) #define PCIB_op_pending (1<<(_PCIB_op_pending)) +#define _EOI_pending (2) +#define EOI_pending (1<<(_EOI_pending)) struct xen_pcibk_device { void *pci_dev_data; @@ -45,13 +48,14 @@ struct xen_pcibk_dev_data { struct list_head config_fields; struct pci_saved_state *pci_saved_state; unsigned int permissive:1; + unsigned int allow_interrupt_control:1; unsigned int warned_on_write:1; unsigned int enable_intx:1; unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ unsigned int ack_intr:1; /* .. and ACK-ing */ unsigned long handled; unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ - char irq_name[0]; /* xen-pcibk[000:04:00.0] */ + char irq_name[]; /* xen-pcibk[000:04:00.0] */ }; /* Used by XenBus and xen_pcibk_ops.c */ @@ -63,10 +67,13 @@ extern struct list_head xen_pcibk_quirks; struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, int domain, int bus, int slot, int func); -struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, - struct pci_dev *dev); void pcistub_put_pci_dev(struct pci_dev *dev); +static inline bool xen_pcibk_pv_support(void) +{ + return IS_ENABLED(CONFIG_XEN_PCIDEV_BACKEND); +} + /* Ensure a device is turned off or reset */ void xen_pcibk_reset_device(struct pci_dev *pdev); @@ -182,13 +189,13 @@ static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); void xen_pcibk_do_op(struct work_struct *data); +static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev, + unsigned int eoi_flag) +{ + if (test_and_clear_bit(_EOI_pending, &pdev->flags)) + xen_irq_lateeoi(pdev->evtchn_irq, eoi_flag); +} + int xen_pcibk_xenbus_register(void); void xen_pcibk_xenbus_unregister(void); - -extern int verbose_request; - -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); #endif - -/* Handles shared IRQs that can to device domain and control domain. 
*/ -void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index ea4a08b83fa0..84e014490950 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -6,6 +6,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/moduleparam.h> #include <linux/wait.h> @@ -14,9 +15,6 @@ #include <linux/sched.h> #include "pciback.h" -int verbose_request; -module_param(verbose_request, int, 0644); - static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id); /* Ensure a device is has the fake IRQ handler "turned on/off" and is @@ -127,8 +125,6 @@ void xen_pcibk_reset_device(struct pci_dev *dev) if (pci_is_enabled(dev)) pci_disable_device(dev); - pci_write_config_word(dev, PCI_COMMAND, 0); - dev->is_busmaster = 0; } else { pci_read_config_word(dev, PCI_COMMAND, &cmd); @@ -149,9 +145,6 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, struct xen_pcibk_dev_data *dev_data; int status; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); - if (dev->msi_enabled) status = -EALREADY; else if (dev->msix_enabled) @@ -160,20 +153,18 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, status = pci_enable_msi(dev); if (status) { - pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", - pci_name(dev), pdev->xdev->otherend_id, - status); + dev_warn_ratelimited(&dev->dev, "error enabling MSI for guest %u: err %d\n", + pdev->xdev->otherend_id, status); op->value = 0; return XEN_PCI_ERR_op_failed; } - /* The value the guest needs is actually the IDT vector, not the + /* The value the guest needs is actually the IDT vector, not * the local domain's IRQ number. */ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), - op->value); + + dev_dbg(&dev->dev, "MSI: %d\n", op->value); dev_data = pci_get_drvdata(dev); if (dev_data) @@ -186,10 +177,6 @@ static int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", - pci_name(dev)); - if (dev->msi_enabled) { struct xen_pcibk_dev_data *dev_data; @@ -200,9 +187,9 @@ int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, dev_data->ack_intr = 1; } op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), - op->value); + + dev_dbg(&dev->dev, "MSI: %d\n", op->value); + return 0; } @@ -215,9 +202,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, struct msix_entry *entries; u16 cmd; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", - pci_name(dev)); + dev_dbg(&dev->dev, "enable MSI-X\n"); if (op->value > SH_INFO_MAX_VEC) return -EINVAL; @@ -250,17 +235,13 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: " \ - "MSI-X[%d]: %d\n", - pci_name(dev), i, - op->msix_entries[i].vector); + dev_dbg(&dev->dev, "MSI-X[%d]: %d\n", i, + op->msix_entries[i].vector); } } } else - pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", - pci_name(dev), pdev->xdev->otherend_id, - result); + dev_warn_ratelimited(&dev->dev, "error enabling MSI-X for guest %u: err %d!\n", + pdev->xdev->otherend_id, result); kfree(entries); op->value = result; @@ -275,10 +256,6 @@ static int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", - pci_name(dev)); - if (dev->msix_enabled) { struct xen_pcibk_dev_data *dev_data; @@ -293,32 +270,47 @@ int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, * an undefined IRQ value of zero. */ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; - if (unlikely(verbose_request)) - printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", - pci_name(dev), op->value); + + dev_dbg(&dev->dev, "MSI-X: %d\n", op->value); + return 0; } #endif + +static inline bool xen_pcibk_test_op_pending(struct xen_pcibk_device *pdev) +{ + return test_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags) && + !test_and_set_bit(_PDEVF_op_active, &pdev->flags); +} + /* * Now the same evtchn is used for both pcifront conf_read_write request * as well as pcie aer front end ack. We use a new work_queue to schedule * xen_pcibk conf_read_write service for avoiding confict with aer_core * do_recovery job which also use the system default work_queue */ -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +static void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) { + bool eoi = true; + /* Check that frontend is requesting an operation and that we are not * already processing a request */ - if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) - && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { + if (xen_pcibk_test_op_pending(pdev)) { schedule_work(&pdev->op_work); + eoi = false; } /*_XEN_PCIB_active should have been cleared by pcifront. And also make sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) && test_bit(_PCIB_op_pending, &pdev->flags)) { wake_up(&xen_pcibk_aer_wait_queue); + eoi = false; } + + /* EOI if there was nothing to do. */ + if (eoi) + xen_pcibk_lateeoi(pdev, XEN_EOI_FLAG_SPURIOUS); } /* Performing the configuration space reads/writes must not be done in atomic @@ -326,10 +318,8 @@ void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) * use of semaphores). 
This function is intended to be called from a work * queue in process context taking a struct xen_pcibk_device as a parameter */ -void xen_pcibk_do_op(struct work_struct *data) +static void xen_pcibk_do_one_op(struct xen_pcibk_device *pdev) { - struct xen_pcibk_device *pdev = - container_of(data, struct xen_pcibk_device, op_work); struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data = NULL; struct xen_pci_op *op = &pdev->op; @@ -402,16 +392,31 @@ void xen_pcibk_do_op(struct work_struct *data) smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); smp_mb__after_atomic(); /* /before/ final check for work */ +} - /* Check to see if the driver domain tried to start another request in - * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. - */ - xen_pcibk_test_and_schedule_op(pdev); +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + + do { + xen_pcibk_do_one_op(pdev); + } while (xen_pcibk_test_op_pending(pdev)); + + xen_pcibk_lateeoi(pdev, 0); } irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) { struct xen_pcibk_device *pdev = dev_id; + bool eoi; + + /* IRQs might come in before pdev->evtchn_irq is written. */ + if (unlikely(pdev->evtchn_irq != irq)) + pdev->evtchn_irq = irq; + + eoi = test_and_set_bit(_EOI_pending, &pdev->flags); + WARN(eoi, "IRQ while EOI pending\n"); xen_pcibk_test_and_schedule_op(pdev); @@ -426,7 +431,7 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) dev_data->handled++; if ((dev_data->handled % 1000) == 0) { if (xen_test_irq_shared(irq)) { - pr_info("%s IRQ line is not shared " + dev_info(&dev->dev, "%s IRQ line is not shared " "with other domains. Turning ISR off\n", dev_data->irq_name); dev_data->ack_intr = 0; diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index f6ba18191c0f..cc7450f2b2a9 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #include <linux/list.h> #include <linux/slab.h> @@ -69,7 +70,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev, int devid, publish_pci_dev_cb publish_cb) { - int err = 0, slot, func = -1; + int err = 0, slot, func = PCI_FUNC(dev->devfn); struct pci_dev_entry *t, *dev_entry; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; @@ -94,23 +95,25 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* * Keep multi-function devices together on the virtual PCI bus, except - * virtual functions. + * that we want to keep virtual functions at func 0 on their own. They + * aren't multi-function devices and hence their presence at func 0 + * may cause guests to not scan the other functions. 
*/ - if (!dev->is_virtfn) { + if (!dev->is_virtfn || func) { for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) continue; t = list_entry(list_first(&vpci_dev->dev_list[slot]), struct pci_dev_entry, list); + if (t->dev->is_virtfn && !PCI_FUNC(t->dev->devfn)) + continue; if (match_slot(dev, t->dev)) { - pr_info("vpci: %s: assign to virtual slot %d func %d\n", - pci_name(dev), slot, - PCI_FUNC(dev->devfn)); + dev_info(&dev->dev, "vpci: assign to virtual slot %d func %d\n", + slot, func); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = PCI_FUNC(dev->devfn); goto unlock; } } @@ -119,11 +122,10 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* Assign to a new slot on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) { - pr_info("vpci: %s: assign to virtual slot %d\n", - pci_name(dev), slot); + dev_info(&dev->dev, "vpci: assign to virtual slot %d\n", + slot); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); goto unlock; } } @@ -233,7 +235,6 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, unsigned int *devfn) { struct pci_dev_entry *entry; - struct pci_dev *dev = NULL; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; int found = 0, slot; @@ -242,11 +243,7 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, list_for_each_entry(entry, &vpci_dev->dev_list[slot], list) { - dev = entry->dev; - if (dev && dev->bus->number == pcidev->bus->number - && pci_domain_nr(dev->bus) == - pci_domain_nr(pcidev->bus) - && dev->devfn == pcidev->devfn) { + if (entry->dev == pcidev) { found = 1; *domain = 0; *bus = 0; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 581c4e1a8b82..b11e401f1b1e 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -14,7 +14,7 @@ #include <linux/workqueue.h> #include <xen/xenbus.h> #include <xen/events.h> -#include <asm/xen/pci.h> +#include <xen/pci.h> #include "pciback.h" #define INVALID_EVTCHN_IRQ (-1) @@ -31,7 +31,7 @@ MODULE_PARM_DESC(passthrough, " frontend (for example, a device at 06:01.b will still appear at\n"\ " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ " exposed PCI devices to its driver domains. 
This may be required\n"\ - " for drivers which depend on finding their hardward in certain\n"\ + " for drivers which depend on finding their hardware in certain\n"\ " bus/slot locations."); static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) @@ -105,13 +105,13 @@ static void free_pdev(struct xen_pcibk_device *pdev) } static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, - int remote_evtchn) + evtchn_port_t remote_evtchn) { int err = 0; void *vaddr; dev_dbg(&pdev->xdev->dev, - "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", + "Attaching to frontend resources - gnt_ref=%d evtchn=%u\n", gnt_ref, remote_evtchn); err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr); @@ -123,8 +123,8 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, pdev->sh_info = vaddr; - err = bind_interdomain_evtchn_to_irqhandler( - pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( + pdev->xdev, remote_evtchn, xen_pcibk_handle_event, 0, DRV_NAME, pdev); if (err < 0) { xenbus_dev_fatal(pdev->xdev, err, @@ -142,7 +142,8 @@ out: static int xen_pcibk_attach(struct xen_pcibk_device *pdev) { int err = 0; - int gnt_ref, remote_evtchn; + int gnt_ref; + evtchn_port_t remote_evtchn; char *magic = NULL; @@ -358,7 +359,8 @@ out: return err; } -static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev, + enum xenbus_state state) { int err = 0; int num_devs; @@ -372,9 +374,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); mutex_lock(&pdev->dev_lock); - /* Make sure we only reconfigure once */ - if (xenbus_read_driver_state(pdev->xdev->nodename) != - XenbusStateReconfiguring) + if (xenbus_read_driver_state(pdev->xdev->nodename) != state) goto out; err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", @@ -499,6 +499,10 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) } } + if (state != XenbusStateReconfiguring) + /* Make sure we only reconfigure once. */ + goto out; + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); if (err) { xenbus_dev_fatal(pdev->xdev, err, @@ -524,7 +528,7 @@ static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, break; case XenbusStateReconfiguring: - xen_pcibk_reconfigure(pdev); + xen_pcibk_reconfigure(pdev, XenbusStateReconfiguring); break; case XenbusStateConnected: @@ -544,7 +548,7 @@ static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, xenbus_switch_state(xdev, XenbusStateClosed); if (xenbus_dev_is_online(xdev)) break; - /* fall through if not online */ + fallthrough; /* if not online */ case XenbusStateUnknown: dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); device_unregister(&xdev->dev); @@ -663,6 +667,15 @@ static void xen_pcibk_be_watch(struct xenbus_watch *watch, xen_pcibk_setup_backend(pdev); break; + case XenbusStateInitialised: + /* + * We typically move to Initialised when the first device was + * added. Hence subsequent devices getting added may need + * reconfiguring. 
+ */ + xen_pcibk_reconfigure(pdev, XenbusStateInitialised); + break; + default: break; } @@ -688,7 +701,7 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, /* watch the backend node for backend configuration information */ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, - xen_pcibk_be_watch); + NULL, xen_pcibk_be_watch); if (err) goto out; @@ -697,20 +710,18 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, /* We need to force a call to our callback here in case * xend already configured us! */ - xen_pcibk_be_watch(&pdev->be_watch, NULL, 0); + xen_pcibk_be_watch(&pdev->be_watch, NULL, NULL); out: return err; } -static int xen_pcibk_xenbus_remove(struct xenbus_device *dev) +static void xen_pcibk_xenbus_remove(struct xenbus_device *dev) { struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev); if (pdev != NULL) free_pdev(pdev); - - return 0; } static const struct xenbus_device_id xen_pcibk_ids[] = { @@ -730,6 +741,9 @@ const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; int __init xen_pcibk_xenbus_register(void) { + if (!xen_pcibk_pv_support()) + return 0; + xen_pcibk_backend = &xen_pcibk_vpci_backend; if (passthrough) xen_pcibk_backend = &xen_pcibk_passthrough_backend; @@ -739,5 +753,6 @@ int __init xen_pcibk_xenbus_register(void) void __exit xen_pcibk_xenbus_unregister(void) { - xenbus_unregister_driver(&xen_pcibk_driver); + if (xen_pcibk_pv_support()) + xenbus_unregister_driver(&xen_pcibk_driver); } diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index c9e23a126218..0c51edfd13dc 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -33,8 +33,6 @@ #define pr_fmt(fmt) "xen-pvscsi: " fmt -#include <stdarg.h> - #include <linux/module.h> #include <linux/utsname.h> #include <linux/interrupt.h> @@ -91,7 +89,6 @@ struct vscsibk_info { unsigned int irq; struct vscsiif_back_ring ring; - int ring_error; spinlock_t ring_lock; atomic_t nr_unreplied_reqs; @@ -100,6 +97,8 @@ struct vscsibk_info { struct list_head v2p_entry_lists; wait_queue_head_t waiting_to_free; + + struct gnttab_page_cache free_pages; }; /* theoretical maximum of grants for one request */ @@ -189,10 +188,6 @@ module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644); MODULE_PARM_DESC(max_buffer_pages, "Maximum number of free pages to keep in backend buffer"); -static DEFINE_SPINLOCK(free_pages_lock); -static int free_pages_num; -static LIST_HEAD(scsiback_free_pages); - /* Global spinlock to protect scsiback TPG list */ static DEFINE_MUTEX(scsiback_mutex); static LIST_HEAD(scsiback_list); @@ -208,41 +203,6 @@ static void scsiback_put(struct vscsibk_info *info) wake_up(&info->waiting_to_free); } -static void put_free_pages(struct page **page, int num) -{ - unsigned long flags; - int i = free_pages_num + num, n = num; - - if (num == 0) - return; - if (i > scsiback_max_buffer_pages) { - n = min(num, i - scsiback_max_buffer_pages); - gnttab_free_pages(n, page + num - n); - n = num - n; - } - spin_lock_irqsave(&free_pages_lock, flags); - for (i = 0; i < n; i++) - list_add(&page[i]->lru, &scsiback_free_pages); - free_pages_num += n; - spin_unlock_irqrestore(&free_pages_lock, flags); -} - -static int get_free_page(struct page **page) -{ - unsigned long flags; - - spin_lock_irqsave(&free_pages_lock, flags); - if (list_empty(&scsiback_free_pages)) { - spin_unlock_irqrestore(&free_pages_lock, flags); - return gnttab_alloc_pages(1, page); - } - page[0] = list_first_entry(&scsiback_free_pages, struct page, lru); - 
list_del(&page[0]->lru); - free_pages_num--; - spin_unlock_irqrestore(&free_pages_lock, flags); - return 0; -} - static unsigned long vaddr_page(struct page *page) { unsigned long pfn = page_to_pfn(page); @@ -260,10 +220,10 @@ static void scsiback_print_status(char *sense_buffer, int errors, { struct scsiback_tpg *tpg = pending_req->v2p->tpg; - pr_err("[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n", + pr_err("[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x\n", tpg->tport->tport_name, pending_req->v2p->lun, - pending_req->cmnd[0], status_byte(errors), msg_byte(errors), - host_byte(errors), driver_byte(errors)); + pending_req->cmnd[0], errors & 0xff, COMMAND_COMPLETE, + host_byte(errors)); } static void scsiback_fast_flush_area(struct vscsibk_pend *req) @@ -303,7 +263,8 @@ static void scsiback_fast_flush_area(struct vscsibk_pend *req) BUG_ON(err); } - put_free_pages(req->pages, req->n_grants); + gnttab_page_cache_put(&req->info->free_pages, req->pages, + req->n_grants); req->n_grants = 0; } @@ -319,6 +280,70 @@ static void scsiback_free_translation_entry(struct kref *kref) kfree(entry); } +static int32_t scsiback_result(int32_t result) +{ + int32_t host_status; + + switch (XEN_VSCSIIF_RSLT_HOST(result)) { + case DID_OK: + host_status = XEN_VSCSIIF_RSLT_HOST_OK; + break; + case DID_NO_CONNECT: + host_status = XEN_VSCSIIF_RSLT_HOST_NO_CONNECT; + break; + case DID_BUS_BUSY: + host_status = XEN_VSCSIIF_RSLT_HOST_BUS_BUSY; + break; + case DID_TIME_OUT: + host_status = XEN_VSCSIIF_RSLT_HOST_TIME_OUT; + break; + case DID_BAD_TARGET: + host_status = XEN_VSCSIIF_RSLT_HOST_BAD_TARGET; + break; + case DID_ABORT: + host_status = XEN_VSCSIIF_RSLT_HOST_ABORT; + break; + case DID_PARITY: + host_status = XEN_VSCSIIF_RSLT_HOST_PARITY; + break; + case DID_ERROR: + host_status = XEN_VSCSIIF_RSLT_HOST_ERROR; + break; + case DID_RESET: + host_status = XEN_VSCSIIF_RSLT_HOST_RESET; + break; + case DID_BAD_INTR: + host_status = XEN_VSCSIIF_RSLT_HOST_BAD_INTR; + break; + case DID_PASSTHROUGH: + host_status = XEN_VSCSIIF_RSLT_HOST_PASSTHROUGH; + break; + case DID_SOFT_ERROR: + host_status = XEN_VSCSIIF_RSLT_HOST_SOFT_ERROR; + break; + case DID_IMM_RETRY: + host_status = XEN_VSCSIIF_RSLT_HOST_IMM_RETRY; + break; + case DID_REQUEUE: + host_status = XEN_VSCSIIF_RSLT_HOST_REQUEUE; + break; + case DID_TRANSPORT_DISRUPTED: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_DISRUPTED; + break; + case DID_TRANSPORT_FAILFAST: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_FAILFAST; + break; + case DID_TRANSPORT_MARGINAL: + host_status = XEN_VSCSIIF_RSLT_HOST_TRANSPORT_MARGINAL; + break; + default: + host_status = XEN_VSCSIIF_RSLT_HOST_ERROR; + break; + } + + return (host_status << 16) | (result & 0x00ffff); +} + static void scsiback_send_response(struct vscsibk_info *info, char *sense_buffer, int32_t result, uint32_t resid, uint16_t rqid) @@ -334,7 +359,7 @@ static void scsiback_send_response(struct vscsibk_info *info, ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt); info->ring.rsp_prod_pvt++; - ring_res->rslt = result; + ring_res->rslt = scsiback_result(result); ring_res->rqid = rqid; if (sense_buffer != NULL && @@ -397,21 +422,18 @@ static void scsiback_cmd_exec(struct vscsibk_pend *pending_req) { struct se_cmd *se_cmd = &pending_req->se_cmd; struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess; - int rc; scsiback_get(pending_req->info); se_cmd->tag = pending_req->rqid; - rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd, - pending_req->sense_buffer, 
pending_req->v2p->lun, - pending_req->data_len, 0, - pending_req->sc_data_direction, TARGET_SCF_ACK_KREF, - pending_req->sgl, pending_req->n_sg, - NULL, 0, NULL, 0); - if (rc < 0) { - transport_send_check_condition_and_sense(se_cmd, - TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0); - transport_generic_free_cmd(se_cmd, 0); - } + target_init_cmd(se_cmd, sess, pending_req->sense_buffer, + pending_req->v2p->lun, pending_req->data_len, 0, + pending_req->sc_data_direction, TARGET_SCF_ACK_KREF); + + if (target_submit_prep(se_cmd, pending_req->cmnd, pending_req->sgl, + pending_req->n_sg, NULL, 0, NULL, 0, GFP_KERNEL)) + return; + + target_submit(se_cmd); } static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, @@ -423,12 +445,12 @@ static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, return 0; err = gnttab_map_refs(map, NULL, pg, cnt); - BUG_ON(err); for (i = 0; i < cnt; i++) { if (unlikely(map[i].status != GNTST_okay)) { pr_err("invalid buffer -- could not remap it\n"); map[i].handle = SCSIBACK_INVALID_HANDLE; - err = -ENOMEM; + if (!err) + err = -ENOMEM; } else { get_page(pg[i]); } @@ -446,8 +468,8 @@ static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req, struct vscsibk_info *info = pending_req->info; for (i = 0; i < cnt; i++) { - if (get_free_page(pg + mapcount)) { - put_free_pages(pg, mapcount); + if (gnttab_page_cache_get(&info->free_pages, pg + mapcount)) { + gnttab_page_cache_put(&info->free_pages, pg, mapcount); pr_err("no grant page\n"); return -ENOMEM; } @@ -597,7 +619,7 @@ static void scsiback_device_action(struct vscsibk_pend *pending_req, struct scsiback_nexus *nexus = tpg->tpg_nexus; struct se_cmd *se_cmd = &pending_req->se_cmd; u64 unpacked_lun = pending_req->v2p->lun; - int rc, err = FAILED; + int rc, err = XEN_VSCSIIF_RSLT_RESET_FAILED; init_completion(&pending_req->tmr_done); @@ -611,7 +633,7 @@ static void scsiback_device_action(struct vscsibk_pend *pending_req, wait_for_completion(&pending_req->tmr_done); err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ? - SUCCESS : FAILED; + XEN_VSCSIIF_RSLT_RESET_SUCCESS : XEN_VSCSIIF_RSLT_RESET_FAILED; scsiback_do_resp_with_sense(NULL, err, 0, pending_req); transport_generic_free_cmd(&pending_req->se_cmd, 0); @@ -722,7 +744,8 @@ static struct vscsibk_pend *prepare_pending_reqs(struct vscsibk_info *info, return pending_req; } -static int scsiback_do_cmd_fn(struct vscsibk_info *info) +static int scsiback_do_cmd_fn(struct vscsibk_info *info, + unsigned int *eoi_flags) { struct vscsiif_back_ring *ring = &info->ring; struct vscsiif_request ring_req; @@ -739,11 +762,12 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) rc = ring->rsp_prod_pvt; pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). 
Halting ring processing\n", info->domid, rp, rc, rp - rc); - info->ring_error = 1; - return 0; + return -EINVAL; } while ((rc != rp)) { + *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS; + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) break; @@ -757,10 +781,10 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) result = DID_NO_CONNECT; break; default: - result = DRIVER_ERROR; + result = DID_ERROR; break; } - scsiback_send_response(info, NULL, result << 24, 0, + scsiback_send_response(info, NULL, result << 16, 0, ring_req.rqid); return 1; } @@ -770,7 +794,7 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) if (scsiback_gnttab_data_map(&ring_req, pending_req)) { scsiback_fast_flush_area(pending_req); scsiback_do_resp_with_sense(NULL, - DRIVER_ERROR << 24, 0, pending_req); + DID_ERROR << 16, 0, pending_req); transport_generic_free_cmd(&pending_req->se_cmd, 0); } else { scsiback_cmd_exec(pending_req); @@ -785,7 +809,7 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) break; default: pr_err_ratelimited("invalid request\n"); - scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24, 0, + scsiback_do_resp_with_sense(NULL, DID_ERROR << 16, 0, pending_req); transport_generic_free_cmd(&pending_req->se_cmd, 0); break; @@ -795,6 +819,8 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) cond_resched(); } + gnttab_page_cache_shrink(&info->free_pages, scsiback_max_buffer_pages); + RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do); return more_to_do; } @@ -802,13 +828,16 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) { struct vscsibk_info *info = dev_id; + int rc; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS; - if (info->ring_error) - return IRQ_HANDLED; - - while (scsiback_do_cmd_fn(info)) + while ((rc = scsiback_do_cmd_fn(info, &eoi_flags)) > 0) cond_resched(); + /* In case of a ring error we keep the event channel masked. 
*/ + if (!rc) + xen_irq_lateeoi(irq, eoi_flags); + return IRQ_HANDLED; } @@ -829,7 +858,7 @@ static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, sring = (struct vscsiif_sring *)area; BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); - err = bind_interdomain_evtchn_to_irq(info->domid, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(info->dev, evtchn); if (err < 0) goto unmap_page; @@ -854,7 +883,8 @@ unmap_page: static int scsiback_map(struct vscsibk_info *info) { struct xenbus_device *dev = info->dev; - unsigned int ring_ref, evtchn; + unsigned int ring_ref; + evtchn_port_t evtchn; int err; err = xenbus_gather(XBT_NIL, dev->otherend, @@ -980,12 +1010,6 @@ out_free: return err; } -static void __scsiback_del_translation_entry(struct v2p_entry *entry) -{ - list_del(&entry->l); - kref_put(&entry->kref, scsiback_free_translation_entry); -} - /* Delete the translation entry specified */ @@ -994,18 +1018,20 @@ static int scsiback_del_translation_entry(struct vscsibk_info *info, { struct v2p_entry *entry; unsigned long flags; - int ret = 0; spin_lock_irqsave(&info->v2p_lock, flags); /* Find out the translation entry specified */ entry = scsiback_chk_translation_entry(info, v); if (entry) - __scsiback_del_translation_entry(entry); - else - ret = -ENOENT; + list_del(&entry->l); spin_unlock_irqrestore(&info->v2p_lock, flags); - return ret; + + if (!entry) + return -ENOENT; + + kref_put(&entry->kref, scsiback_free_translation_entry); + return 0; } static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, @@ -1079,7 +1105,7 @@ static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, "%s: writing %s", __func__, state); return; } - strlcpy(phy, val, VSCSI_NAMELEN); + strscpy(phy, val, VSCSI_NAMELEN); kfree(val); /* virtual SCSI device */ @@ -1184,7 +1210,7 @@ static void scsiback_frontend_changed(struct xenbus_device *dev, xenbus_switch_state(dev, XenbusStateClosed); if (xenbus_dev_is_online(dev)) break; - /* fall through if not online */ + fallthrough; /* if not online */ case XenbusStateUnknown: device_unregister(&dev->dev); break; @@ -1209,17 +1235,22 @@ static void scsiback_release_translation_entry(struct vscsibk_info *info) { struct v2p_entry *entry, *tmp; struct list_head *head = &(info->v2p_entry_lists); + struct list_head tmp_list; unsigned long flags; spin_lock_irqsave(&info->v2p_lock, flags); - list_for_each_entry_safe(entry, tmp, head, l) - __scsiback_del_translation_entry(entry); + list_cut_before(&tmp_list, head, head); spin_unlock_irqrestore(&info->v2p_lock, flags); + + list_for_each_entry_safe(entry, tmp, &tmp_list, l) { + list_del(&entry->l); + kref_put(&entry->kref, scsiback_free_translation_entry); + } } -static int scsiback_remove(struct xenbus_device *dev) +static void scsiback_remove(struct xenbus_device *dev) { struct vscsibk_info *info = dev_get_drvdata(&dev->dev); @@ -1228,9 +1259,9 @@ static int scsiback_remove(struct xenbus_device *dev) scsiback_release_translation_entry(info); - dev_set_drvdata(&dev->dev, NULL); + gnttab_page_cache_shrink(&info->free_pages, 0); - return 0; + dev_set_drvdata(&dev->dev, NULL); } static int scsiback_probe(struct xenbus_device *dev, @@ -1252,13 +1283,13 @@ static int scsiback_probe(struct xenbus_device *dev, info->domid = dev->otherend_id; spin_lock_init(&info->ring_lock); - info->ring_error = 0; atomic_set(&info->nr_unreplied_reqs, 0); init_waitqueue_head(&info->waiting_to_free); info->dev = dev; info->irq = 0; INIT_LIST_HEAD(&info->v2p_entry_lists); 
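
[Editor's note on the scsiback conversion above: the driver-private free-page list (free_pages_lock, scsiback_free_pages, put_free_pages()/get_free_page()) is replaced by the common gnttab_page_cache helpers, giving each vscsibk_info its own cache with a shared init/get/put/shrink lifecycle; the cache carries its own lock, which is why the module-wide spinlock could go away. A minimal per-instance sketch follows, assuming a hypothetical struct backend — the real driver keeps the cache in struct vscsibk_info and shrinks it to scsiback_max_buffer_pages after each request burst.]

    #include <linux/mm.h>
    #include <xen/grant_table.h>

    struct backend {                    /* hypothetical per-instance state */
            struct gnttab_page_cache cache;
    };

    static void backend_init(struct backend *be)
    {
            gnttab_page_cache_init(&be->cache);
    }

    static int backend_get_page(struct backend *be, struct page **pg)
    {
            /* Falls back to gnttab_alloc_pages() when the cache is empty. */
            return gnttab_page_cache_get(&be->cache, pg);
    }

    static void backend_put_pages(struct backend *be, struct page **pg,
                                  unsigned int num)
    {
            gnttab_page_cache_put(&be->cache, pg, num);
            /* Trim the cache back to a bounded size, as scsiback does. */
            gnttab_page_cache_shrink(&be->cache, 16);
    }
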
spin_lock_init(&info->v2p_lock); + gnttab_page_cache_init(&info->free_pages); err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u", SG_ALL); @@ -1376,11 +1407,6 @@ static void scsiback_drop_tport(struct se_wwn *wwn) kfree(tport); } -static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg) -{ - return 1; -} - static int scsiback_check_stop_free(struct se_cmd *se_cmd) { return transport_generic_free_cmd(se_cmd, 0); @@ -1391,11 +1417,6 @@ static void scsiback_release_cmd(struct se_cmd *se_cmd) target_free_tag(se_cmd->se_sess, se_cmd); } -static u32 scsiback_sess_get_index(struct se_session *se_sess) -{ - return 0; -} - static int scsiback_write_pending(struct se_cmd *se_cmd) { /* Go ahead and process the write immediately */ @@ -1404,20 +1425,6 @@ static int scsiback_write_pending(struct se_cmd *se_cmd) return 0; } -static int scsiback_write_pending_status(struct se_cmd *se_cmd) -{ - return 0; -} - -static void scsiback_set_default_node_attrs(struct se_node_acl *nacl) -{ -} - -static int scsiback_get_cmd_state(struct se_cmd *se_cmd) -{ - return 0; -} - static int scsiback_queue_data_in(struct se_cmd *se_cmd) { struct vscsibk_pend *pending_req = container_of(se_cmd, @@ -1436,8 +1443,7 @@ static int scsiback_queue_status(struct se_cmd *se_cmd) if (se_cmd->sense_buffer && ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE))) - pending_req->result = (DRIVER_SENSE << 24) | - SAM_STAT_CHECK_CONDITION; + pending_req->result = SAM_STAT_CHECK_CONDITION; else pending_req->result = se_cmd->scsi_status; @@ -1798,11 +1804,6 @@ static int scsiback_check_true(struct se_portal_group *se_tpg) return 1; } -static int scsiback_check_false(struct se_portal_group *se_tpg) -{ - return 0; -} - static const struct target_core_fabric_ops scsiback_ops = { .module = THIS_MODULE, .fabric_name = "xen-pvscsi", @@ -1810,17 +1811,10 @@ static const struct target_core_fabric_ops scsiback_ops = { .tpg_get_tag = scsiback_get_tag, .tpg_check_demo_mode = scsiback_check_true, .tpg_check_demo_mode_cache = scsiback_check_true, - .tpg_check_demo_mode_write_protect = scsiback_check_false, - .tpg_check_prod_mode_write_protect = scsiback_check_false, - .tpg_get_inst_index = scsiback_tpg_get_inst_index, .check_stop_free = scsiback_check_stop_free, .release_cmd = scsiback_release_cmd, - .sess_get_index = scsiback_sess_get_index, .sess_get_initiator_sid = NULL, .write_pending = scsiback_write_pending, - .write_pending_status = scsiback_write_pending_status, - .set_default_node_attributes = scsiback_set_default_node_attrs, - .get_cmd_state = scsiback_get_cmd_state, .queue_data_in = scsiback_queue_data_in, .queue_status = scsiback_queue_status, .queue_tm_rsp = scsiback_queue_tm_rsp, @@ -1838,6 +1832,9 @@ static const struct target_core_fabric_ops scsiback_ops = { .tfc_wwn_attrs = scsiback_wwn_attrs, .tfc_tpg_base_attrs = scsiback_tpg_attrs, .tfc_tpg_param_attrs = scsiback_param_attrs, + + .default_submit_type = TARGET_DIRECT_SUBMIT, + .direct_submit_supp = 1, }; static const struct xenbus_device_id scsiback_ids[] = { @@ -1881,13 +1878,6 @@ out: static void __exit scsiback_exit(void) { - struct page *page; - - while (free_pages_num) { - if (get_free_page(&page)) - BUG(); - gnttab_free_pages(1, &page); - } target_unregister_template(&scsiback_ops); xenbus_unregister_driver(&scsiback_driver); } diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c deleted file mode 100644 index 246f6122c9ee..000000000000 --- a/drivers/xen/xen-selfballoon.c +++ 
/dev/null @@ -1,579 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/****************************************************************************** - * Xen selfballoon driver (and optional frontswap self-shrinking driver) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * This code complements the cleancache and frontswap patchsets to optimize - * support for Xen Transcendent Memory ("tmem"). The policy it implements - * is rudimentary and will likely improve over time, but it does work well - * enough today. - * - * Two functionalities are implemented here which both use "control theory" - * (feedback) to optimize memory utilization. In a virtualized environment - * such as Xen, RAM is often a scarce resource and we would like to ensure - * that each of a possibly large number of virtual machines is using RAM - * efficiently, i.e. using as little as possible when under light load - * and obtaining as much as possible when memory demands are high. - * Since RAM needs vary highly dynamically and sometimes dramatically, - * "hysteresis" is used, that is, memory target is determined not just - * on current data but also on past data stored in the system. - * - * "Selfballooning" creates memory pressure by managing the Xen balloon - * driver to decrease and increase available kernel memory, driven - * largely by the target value of "Committed_AS" (see /proc/meminfo). - * Since Committed_AS does not account for clean mapped pages (i.e. pages - * in RAM that are identical to pages on disk), selfballooning has the - * affect of pushing less frequently used clean pagecache pages out of - * kernel RAM and, presumably using cleancache, into Xen tmem where - * Xen can more efficiently optimize RAM utilization for such pages. - * - * When kernel memory demand unexpectedly increases faster than Xen, via - * the selfballoon driver, is able to (or chooses to) provide usable RAM, - * the kernel may invoke swapping. In most cases, frontswap is able - * to absorb this swapping into Xen tmem. However, due to the fact - * that the kernel swap subsystem assumes swapping occurs to a disk, - * swapped pages may sit on the disk for a very long time; even if - * the kernel knows the page will never be used again. This is because - * the disk space costs very little and can be overwritten when - * necessary. When such stale pages are in frontswap, however, they - * are taking up valuable real estate. "Frontswap selfshrinking" works - * to resolve this: When frontswap activity is otherwise stable - * and the guest kernel is not under memory pressure, the "frontswap - * selfshrinking" accounts for this by providing pressure to remove some - * pages from frontswap and return them to kernel memory. - * - * For both "selfballooning" and "frontswap-selfshrinking", a worker - * thread is used and sysfs tunables are provided to adjust the frequency - * and rate of adjustments to achieve the goal, as well as to disable one - * or both functions independently. - * - * While some argue that this functionality can and should be implemented - * in userspace, it has been observed that bad things happen (e.g. OOMs). - * - * System configuration note: Selfballooning should not be enabled on - * systems without a sufficiently large swap device configured; for best - * results, it is recommended that total swap be increased by the size - * of the guest memory. Note, that selfballooning should be disabled by default - * if frontswap is not configured. 
Similarly selfballooning should be enabled - * by default if frontswap is configured and can be disabled with the - * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is - * configured, frontswap-selfshrinking can be disabled with the - * "tmem.selfshrink=0" kernel boot option. - * - * Selfballooning is disallowed in domain0 and force-disabled. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/swap.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/workqueue.h> -#include <linux/device.h> -#include <xen/balloon.h> -#include <xen/tmem.h> -#include <xen/xen.h> - -/* Enable/disable with sysfs. */ -static int xen_selfballooning_enabled __read_mostly; - -/* - * Controls rate at which memory target (this iteration) approaches - * ultimate goal when memory need is increasing (up-hysteresis) or - * decreasing (down-hysteresis). Higher values of hysteresis cause - * slower increases/decreases. The default values for the various - * parameters were deemed reasonable by experimentation, may be - * workload-dependent, and can all be adjusted via sysfs. - */ -static unsigned int selfballoon_downhysteresis __read_mostly = 8; -static unsigned int selfballoon_uphysteresis __read_mostly = 1; - -/* In HZ, controls frequency of worker invocation. */ -static unsigned int selfballoon_interval __read_mostly = 5; - -/* - * Minimum usable RAM in MB for selfballooning target for balloon. - * If non-zero, it is added to totalreserve_pages and self-ballooning - * will not balloon below the sum. If zero, a piecewise linear function - * is calculated as a minimum and added to totalreserve_pages. Note that - * setting this value indiscriminately may cause OOMs and crashes. - */ -static unsigned int selfballoon_min_usable_mb; - -/* - * Amount of RAM in MB to add to the target number of pages. - * Can be used to reserve some more room for caches and the like. - */ -static unsigned int selfballoon_reserved_mb; - -static void selfballoon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); - -#ifdef CONFIG_FRONTSWAP -#include <linux/frontswap.h> - -/* Enable/disable with sysfs. */ -static bool frontswap_selfshrinking __read_mostly; - -/* - * The default values for the following parameters were deemed reasonable - * by experimentation, may be workload-dependent, and can all be - * adjusted via sysfs. - */ - -/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ -static unsigned int frontswap_hysteresis __read_mostly = 20; - -/* - * Number of selfballoon worker invocations to wait before observing that - * frontswap selfshrinking should commence. Note that selfshrinking does - * not use a separate worker thread. - */ -static unsigned int frontswap_inertia __read_mostly = 3; - -/* Countdown to next invocation of frontswap_shrink() */ -static unsigned long frontswap_inertia_counter; - -/* - * Invoked by the selfballoon worker thread, uses current number of pages - * in frontswap (frontswap_curr_pages()), previous status, and control - * values (hysteresis and inertia) to determine if frontswap should be - * shrunk and what the new frontswap size should be. 
Note that - * frontswap_shrink is essentially a partial swapoff that immediately - * transfers pages from the "swap device" (frontswap) back into kernel - * RAM; despite the name, frontswap "shrinking" is very different from - * the "shrinker" interface used by the kernel MM subsystem to reclaim - * memory. - */ -static void frontswap_selfshrink(void) -{ - static unsigned long cur_frontswap_pages; - unsigned long last_frontswap_pages; - unsigned long tgt_frontswap_pages; - - last_frontswap_pages = cur_frontswap_pages; - cur_frontswap_pages = frontswap_curr_pages(); - if (!cur_frontswap_pages || - (cur_frontswap_pages > last_frontswap_pages)) { - frontswap_inertia_counter = frontswap_inertia; - return; - } - if (frontswap_inertia_counter && --frontswap_inertia_counter) - return; - if (cur_frontswap_pages <= frontswap_hysteresis) - tgt_frontswap_pages = 0; - else - tgt_frontswap_pages = cur_frontswap_pages - - (cur_frontswap_pages / frontswap_hysteresis); - frontswap_shrink(tgt_frontswap_pages); - frontswap_inertia_counter = frontswap_inertia; -} - -#endif /* CONFIG_FRONTSWAP */ - -#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) -#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) - -/* - * Use current balloon size, the goal (vm_committed_as), and hysteresis - * parameters to set a new target balloon size - */ -static void selfballoon_process(struct work_struct *work) -{ - unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; - unsigned long useful_pages; - bool reset_timer = false; - - if (xen_selfballooning_enabled) { - cur_pages = totalram_pages(); - tgt_pages = cur_pages; /* default is no change */ - goal_pages = vm_memory_committed() + - totalreserve_pages + - MB2PAGES(selfballoon_reserved_mb); -#ifdef CONFIG_FRONTSWAP - /* allow space for frontswap pages to be repatriated */ - if (frontswap_selfshrinking) - goal_pages += frontswap_curr_pages(); -#endif - if (cur_pages > goal_pages) - tgt_pages = cur_pages - - ((cur_pages - goal_pages) / - selfballoon_downhysteresis); - else if (cur_pages < goal_pages) - tgt_pages = cur_pages + - ((goal_pages - cur_pages) / - selfballoon_uphysteresis); - /* else if cur_pages == goal_pages, no change */ - useful_pages = max_pfn - totalreserve_pages; - if (selfballoon_min_usable_mb != 0) - floor_pages = totalreserve_pages + - MB2PAGES(selfballoon_min_usable_mb); - /* piecewise linear function ending in ~3% slope */ - else if (useful_pages < MB2PAGES(16)) - floor_pages = max_pfn; /* not worth ballooning */ - else if (useful_pages < MB2PAGES(64)) - floor_pages = totalreserve_pages + MB2PAGES(16) + - ((useful_pages - MB2PAGES(16)) >> 1); - else if (useful_pages < MB2PAGES(512)) - floor_pages = totalreserve_pages + MB2PAGES(40) + - ((useful_pages - MB2PAGES(40)) >> 3); - else /* useful_pages >= MB2PAGES(512) */ - floor_pages = totalreserve_pages + MB2PAGES(99) + - ((useful_pages - MB2PAGES(99)) >> 5); - if (tgt_pages < floor_pages) - tgt_pages = floor_pages; - balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages()); - reset_timer = true; - } -#ifdef CONFIG_FRONTSWAP - if (frontswap_selfshrinking) { - frontswap_selfshrink(); - reset_timer = true; - } -#endif - if (reset_timer) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); -} - -#ifdef CONFIG_SYSFS - -#include <linux/capability.h> - -#define SELFBALLOON_SHOW(name, format, args...) 
\ - static ssize_t show_##name(struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } - -SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); - -static ssize_t store_selfballooning(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - bool was_enabled = xen_selfballooning_enabled; - unsigned long tmp; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &tmp); - if (err) - return err; - if ((tmp != 0) && (tmp != 1)) - return -EINVAL; - - xen_selfballooning_enabled = !!tmp; - if (!was_enabled && xen_selfballooning_enabled) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); - - return count; -} - -static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR, - show_selfballooning, store_selfballooning); - -SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); - -static ssize_t store_selfballoon_interval(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - selfballoon_interval = val; - return count; -} - -static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, - show_selfballoon_interval, store_selfballoon_interval); - -SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); - -static ssize_t store_selfballoon_downhys(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - selfballoon_downhysteresis = val; - return count; -} - -static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, - show_selfballoon_downhys, store_selfballoon_downhys); - - -SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); - -static ssize_t store_selfballoon_uphys(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - selfballoon_uphysteresis = val; - return count; -} - -static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, - show_selfballoon_uphys, store_selfballoon_uphys); - -SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", - selfballoon_min_usable_mb); - -static ssize_t store_selfballoon_min_usable_mb(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - selfballoon_min_usable_mb = val; - return count; -} - -static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, - show_selfballoon_min_usable_mb, - store_selfballoon_min_usable_mb); - -SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", - selfballoon_reserved_mb); - -static ssize_t store_selfballoon_reserved_mb(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - 
selfballoon_reserved_mb = val; - return count; -} - -static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, - show_selfballoon_reserved_mb, - store_selfballoon_reserved_mb); - - -#ifdef CONFIG_FRONTSWAP -SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); - -static ssize_t store_frontswap_selfshrinking(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - bool was_enabled = frontswap_selfshrinking; - unsigned long tmp; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &tmp); - if (err) - return err; - if ((tmp != 0) && (tmp != 1)) - return -EINVAL; - frontswap_selfshrinking = !!tmp; - if (!was_enabled && !xen_selfballooning_enabled && - frontswap_selfshrinking) - schedule_delayed_work(&selfballoon_worker, - selfballoon_interval * HZ); - - return count; -} - -static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, - show_frontswap_selfshrinking, store_frontswap_selfshrinking); - -SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); - -static ssize_t store_frontswap_inertia(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - frontswap_inertia = val; - frontswap_inertia_counter = val; - return count; -} - -static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, - show_frontswap_inertia, store_frontswap_inertia); - -SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); - -static ssize_t store_frontswap_hysteresis(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - err = kstrtoul(buf, 10, &val); - if (err) - return err; - if (val == 0) - return -EINVAL; - frontswap_hysteresis = val; - return count; -} - -static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, - show_frontswap_hysteresis, store_frontswap_hysteresis); - -#endif /* CONFIG_FRONTSWAP */ - -static struct attribute *selfballoon_attrs[] = { - &dev_attr_selfballooning.attr, - &dev_attr_selfballoon_interval.attr, - &dev_attr_selfballoon_downhysteresis.attr, - &dev_attr_selfballoon_uphysteresis.attr, - &dev_attr_selfballoon_min_usable_mb.attr, - &dev_attr_selfballoon_reserved_mb.attr, -#ifdef CONFIG_FRONTSWAP - &dev_attr_frontswap_selfshrinking.attr, - &dev_attr_frontswap_hysteresis.attr, - &dev_attr_frontswap_inertia.attr, -#endif - NULL -}; - -static const struct attribute_group selfballoon_group = { - .name = "selfballoon", - .attrs = selfballoon_attrs -}; -#endif - -int register_xen_selfballooning(struct device *dev) -{ - int error = -1; - -#ifdef CONFIG_SYSFS - error = sysfs_create_group(&dev->kobj, &selfballoon_group); -#endif - return error; -} -EXPORT_SYMBOL(register_xen_selfballooning); - -int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) -{ - bool enable = false; - unsigned long reserve_pages; - - if (!xen_domain()) - return -ENODEV; - - if (xen_initial_domain()) { - pr_info("Xen selfballooning driver disabled for domain0\n"); - return -ENODEV; - } - - xen_selfballooning_enabled = tmem_enabled && use_selfballooning; - if (xen_selfballooning_enabled) { - pr_info("Initializing Xen selfballooning driver\n"); - enable = true; - } -#ifdef CONFIG_FRONTSWAP - frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; - 
if (frontswap_selfshrinking) { - pr_info("Initializing frontswap selfshrinking driver\n"); - enable = true; - } -#endif - if (!enable) - return -ENODEV; - - /* - * Give selfballoon_reserved_mb a default value(10% of total ram pages) - * to make selfballoon not so aggressive. - * - * There are mainly two reasons: - * 1) The original goal_page didn't consider some pages used by kernel - * space, like slab pages and memory used by device drivers. - * - * 2) The balloon driver may not give back memory to guest OS fast - * enough when the workload suddenly aquries a lot of physical memory. - * - * In both cases, the guest OS will suffer from memory pressure and - * OOM killer may be triggered. - * By reserving extra 10% of total ram pages, we can keep the system - * much more reliably and response faster in some cases. - */ - if (!selfballoon_reserved_mb) { - reserve_pages = totalram_pages() / 10; - selfballoon_reserved_mb = PAGES2MB(reserve_pages); - } - schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); - - return 0; -} -EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c deleted file mode 100644 index bbef194c5b01..000000000000 --- a/drivers/xen/xen-stub.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * xen-stub.c - stub drivers to reserve space for Xen - * - * Copyright (C) 2012 Intel Corporation - * Author: Liu Jinsong <jinsong.liu@intel.com> - * Author: Jiang Yunhong <yunhong.jiang@intel.com> - * - * Copyright (C) 2012 Oracle Inc - * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. 
- */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/acpi.h> -#include <xen/acpi.h> - -#ifdef CONFIG_ACPI - -/*-------------------------------------------- - stub driver for Xen memory hotplug ---------------------------------------------*/ - -static const struct acpi_device_id memory_device_ids[] = { - {ACPI_MEMORY_DEVICE_HID, 0}, - {"", 0}, -}; - -static struct acpi_driver xen_stub_memory_device_driver = { - /* same name as native memory driver to block native loaded */ - .name = "acpi_memhotplug", - .class = ACPI_MEMORY_DEVICE_CLASS, - .ids = memory_device_ids, -}; - -int xen_stub_memory_device_init(void) -{ - if (!xen_initial_domain()) - return -ENODEV; - - /* just reserve space for Xen, block native driver loaded */ - return acpi_bus_register_driver(&xen_stub_memory_device_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); -subsys_initcall(xen_stub_memory_device_init); - -void xen_stub_memory_device_exit(void) -{ - acpi_bus_unregister_driver(&xen_stub_memory_device_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); - - -/*-------------------------------------------- - stub driver for Xen cpu hotplug ---------------------------------------------*/ - -static const struct acpi_device_id processor_device_ids[] = { - {ACPI_PROCESSOR_OBJECT_HID, 0}, - {ACPI_PROCESSOR_DEVICE_HID, 0}, - {"", 0}, -}; - -static struct acpi_driver xen_stub_processor_driver = { - /* same name as native processor driver to block native loaded */ - .name = "processor", - .class = ACPI_PROCESSOR_CLASS, - .ids = processor_device_ids, -}; - -int xen_stub_processor_init(void) -{ - if (!xen_initial_domain()) - return -ENODEV; - - /* just reserve space for Xen, block native driver loaded */ - return acpi_bus_register_driver(&xen_stub_processor_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_processor_init); -subsys_initcall(xen_stub_processor_init); - -void xen_stub_processor_exit(void) -{ - acpi_bus_unregister_driver(&xen_stub_processor_driver); -} -EXPORT_SYMBOL_GPL(xen_stub_processor_exit); - -#endif diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 0c7532110815..b0d69602214e 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -1,15 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += xenbus.o -obj-y += xenbus_dev_frontend.o -xenbus-objs = -xenbus-objs += xenbus_client.o -xenbus-objs += xenbus_comms.o -xenbus-objs += xenbus_xs.o -xenbus-objs += xenbus_probe.o +xenbus-y := xenbus_client.o +xenbus-y += xenbus_comms.o +xenbus-y += xenbus_xs.o +xenbus-y += xenbus_probe.o -xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o -xenbus-objs += $(xenbus-be-objs-y) +xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o +obj-y += xenbus_dev_frontend.o obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 092981171df1..9ac0427724a3 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -44,6 +44,8 @@ struct xen_bus_type { int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); + bool (*otherend_will_handle)(struct xenbus_watch *watch, + const char *path, const char *token); void (*otherend_changed)(struct xenbus_watch *watch, const char *path, const char *token); struct bus_type bus; @@ -75,6 +77,7 @@ enum xb_req_state { struct 
xb_req_data { struct list_head list; wait_queue_head_t wq; + struct kref kref; struct xsd_sockmsg msg; uint32_t caller_req_id; enum xsd_sockmsg_type type; @@ -83,6 +86,7 @@ struct xb_req_data { int num_vecs; int err; enum xb_req_state state; + bool user_req; void (*cb)(struct xb_req_data *); void *par; }; @@ -100,10 +104,11 @@ int xb_init_comms(void); void xb_deinit_comms(void); int xs_watch_msg(struct xs_watch_event *event); void xs_request_exit(struct xb_req_data *req); +void xs_free_req(struct kref *kref); -int xenbus_match(struct device *_dev, struct device_driver *_drv); +int xenbus_match(struct device *_dev, const struct device_driver *_drv); int xenbus_dev_probe(struct device *_dev); -int xenbus_dev_remove(struct device *_dev); +void xenbus_dev_remove(struct device *_dev); int xenbus_register_driver_common(struct xenbus_driver *drv, struct xen_bus_type *bus, struct module *owner, @@ -115,8 +120,6 @@ int xenbus_probe_devices(struct xen_bus_type *bus); void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); -void xenbus_dev_shutdown(struct device *_dev); - int xenbus_dev_suspend(struct device *dev); int xenbus_dev_resume(struct device *dev); int xenbus_dev_cancel(struct device *dev); @@ -133,4 +136,6 @@ void xenbus_ring_ops_init(void); int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par); void xenbus_dev_queue_reply(struct xb_req_data *req); +extern unsigned int xb_dev_generation_id; + #endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index e17ca8156171..2dc874fb5506 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -69,11 +69,24 @@ struct xenbus_map_node { unsigned int nr_handles; }; +struct map_ring_valloc { + struct xenbus_map_node *node; + + /* Why do we need two arrays? See comment of __xenbus_map_ring */ + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + + struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; + + unsigned int idx; +}; + static DEFINE_SPINLOCK(xenbus_valloc_lock); static LIST_HEAD(xenbus_valloc_pages); struct xenbus_ring_ops { - int (*map)(struct xenbus_device *dev, + int (*map)(struct xenbus_device *dev, struct map_ring_valloc *info, grant_ref_t *gnt_refs, unsigned int nr_grefs, void **vaddr); int (*unmap)(struct xenbus_device *dev, void *vaddr); @@ -103,29 +116,36 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); * @dev: xenbus device * @path: path to watch * @watch: watch to register + * @will_handle: callback that decides whether an event should be queued * @callback: callback to register * * Register a @watch on the given path, using the given xenbus_watch structure - * for storage, and the given @callback function as the callback. Return 0 on - * success, or -errno on error. On success, the given @path will be saved as - * @watch->node, and remains the caller's to free. On error, @watch->node will - * be NULL, the device will switch to %XenbusStateClosing, and the error will - * be saved in the store. + * for storage, @will_handle function as the callback to determine if each + * event needs to be queued, and the given @callback function as the callback. + * On success, the given @path will be saved as @watch->node, and remains the + * caller's to free. On error, @watch->node will be NULL, the device will + * switch to %XenbusStateClosing, and the error will be saved in the store.
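A hedged sketch of a caller adapting to the extra @will_handle argument; my_watch, my_will_handle, my_changed and my_start_watch are hypothetical names (assuming <xen/xenbus.h>), and the nr_pending test mirrors frontend_will_handle() further down in this diff:

static struct xenbus_watch my_watch;

/* Queue at most one event; anything arriving while one is still
 * pending is discarded before it is queued. */
static bool my_will_handle(struct xenbus_watch *watch,
			   const char *path, const char *token)
{
	return watch->nr_pending == 0;
}

static void my_changed(struct xenbus_watch *watch,
		       const char *path, const char *token)
{
	/* react to the store change */
}

static int my_start_watch(struct xenbus_device *dev)
{
	/* Passing NULL instead of my_will_handle keeps the old
	 * queue-every-event behaviour. */
	return xenbus_watch_path(dev, dev->otherend, &my_watch,
				 my_will_handle, my_changed);
}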
+ * + * Returns: %0 on success or -errno on error */ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *)) { int err; watch->node = path; + watch->will_handle = will_handle; watch->callback = callback; err = register_xenbus_watch(watch); if (err) { watch->node = NULL; + watch->will_handle = NULL; watch->callback = NULL; xenbus_dev_fatal(dev, err, "adding watch on %s", path); } @@ -139,19 +159,25 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path * @dev: xenbus device * @watch: watch to register + * @will_handle: callback that decides whether an event should be queued * @callback: callback to register * @pathfmt: format of path to watch * * Register a watch on the given @path, using the given xenbus_watch - * structure for storage, and the given @callback function as the callback. - * Return 0 on success, or -errno on error. On success, the watched path - * (@path/@path2) will be saved as @watch->node, and becomes the caller's to - * kfree(). On error, watch->node will be NULL, so the caller has nothing to + * structure for storage, @will_handle function as the callback to determine if + * each event needs to be queued, and the given @callback function as the + * callback. On success, the watched path (@path/@path2) will be saved + * as @watch->node, and becomes the caller's to kfree(). + * On error, watch->node will be NULL, so the caller has nothing to * free, the device will switch to %XenbusStateClosing, and the error will be * saved in the store. + * + * Returns: %0 on success or -errno on error */ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *), const char *pathfmt, ...) @@ -168,7 +194,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; } - err = xenbus_watch_path(dev, path, watch, callback); + err = xenbus_watch_path(dev, path, watch, will_handle, callback); if (err) kfree(path); @@ -176,6 +202,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); +__printf(4, 5) static void xenbus_switch_fatal(struct xenbus_device *, int, int, const char *, ...); @@ -236,13 +263,15 @@ abort: } /** - * xenbus_switch_state + * xenbus_switch_state - save the new state of a driver * @dev: xenbus device * @state: new state * * Advertise in the store a change of the given driver to the given new_state. - * Return 0 on success, or -errno on error. On error, the device will switch - * to XenbusStateClosing, and the error will be saved in the store. + * On error, the device will switch to XenbusStateClosing, and the error + * will be saved in the store.
+ * + * Returns: %0 on success or -errno on error */ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) { @@ -259,6 +288,7 @@ int xenbus_frontend_closed(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_frontend_closed); +__printf(3, 0) static void xenbus_va_dev_error(struct xenbus_device *dev, int err, const char *fmt, va_list ap) { @@ -286,7 +316,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err, } /** - * xenbus_dev_error + * xenbus_dev_error - place an error message into the store * @dev: xenbus device * @err: error to report * @fmt: error message format @@ -305,7 +335,7 @@ void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...) EXPORT_SYMBOL_GPL(xenbus_dev_error); /** - * xenbus_dev_fatal + * xenbus_dev_fatal - put an error messages into the store and then shutdown * @dev: xenbus device * @err: error to report * @fmt: error message format @@ -327,7 +357,7 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) } EXPORT_SYMBOL_GPL(xenbus_dev_fatal); -/** +/* * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps * avoiding recursion within xenbus_switch_state. */ @@ -344,54 +374,103 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, __xenbus_switch_state(dev, XenbusStateClosing, 1); } -/** - * xenbus_grant_ring +/* + * xenbus_setup_ring * @dev: xenbus device - * @vaddr: starting virtual address of the ring + * @vaddr: pointer to starting virtual address of the ring * @nr_pages: number of pages to be granted * @grefs: grant reference array to be filled in * - * Grant access to the given @vaddr to the peer of the given device. - * Then fill in @grefs with grant references. Return 0 on success, or - * -errno on error. On error, the device will switch to - * XenbusStateClosing, and the error will be saved in the store. + * Allocate physically contiguous pages for a shared ring buffer and grant it + * to the peer of the given device. The ring buffer is initially filled with + * zeroes. The virtual address of the ring is stored at @vaddr and the + * grant references are stored in the @grefs array. In case of error @vaddr + * will be set to NULL and @grefs will be filled with INVALID_GRANT_REF. 
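A sketch of a frontend using the new helper pair; struct my_ring_info and the single-page ring are assumptions made for the example:

struct my_ring_info {
	void *ring;		/* CPU mapping of the shared page */
	grant_ref_t ring_ref;
};

static int my_alloc_shared_ring(struct xenbus_device *dev,
				struct my_ring_info *info)
{
	/* One zeroed, physically contiguous page, already granted to
	 * the peer of @dev. */
	return xenbus_setup_ring(dev, GFP_KERNEL, &info->ring,
				 1, &info->ring_ref);
}

static void my_free_shared_ring(struct my_ring_info *info)
{
	/* Ends the grant and frees the page; also safe to call after a
	 * failed setup, since the error path leaves ring == NULL and
	 * ring_ref == INVALID_GRANT_REF. */
	xenbus_teardown_ring(&info->ring, 1, &info->ring_ref);
}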
*/ -int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, +int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, unsigned int nr_pages, grant_ref_t *grefs) { - int err; - int i, j; + unsigned long ring_size = nr_pages * XEN_PAGE_SIZE; + grant_ref_t gref_head; + unsigned int i; + void *addr; + int ret; + + addr = *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO); + if (!*vaddr) { + ret = -ENOMEM; + goto err; + } + + ret = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (ret) { + xenbus_dev_fatal(dev, ret, "granting access to %u ring pages", + nr_pages); + goto err; + } for (i = 0; i < nr_pages; i++) { - err = gnttab_grant_foreign_access(dev->otherend_id, - virt_to_gfn(vaddr), 0); - if (err < 0) { - xenbus_dev_fatal(dev, err, - "granting access to ring page"); - goto fail; - } - grefs[i] = err; + unsigned long gfn; + + if (is_vmalloc_addr(*vaddr)) + gfn = pfn_to_gfn(vmalloc_to_pfn(addr)); + else + gfn = virt_to_gfn(addr); + + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); - vaddr = vaddr + XEN_PAGE_SIZE; + addr += XEN_PAGE_SIZE; } return 0; -fail: - for (j = 0; j < i; j++) - gnttab_end_foreign_access_ref(grefs[j], 0); - return err; + err: + if (*vaddr) + free_pages_exact(*vaddr, ring_size); + for (i = 0; i < nr_pages; i++) + grefs[i] = INVALID_GRANT_REF; + *vaddr = NULL; + + return ret; } -EXPORT_SYMBOL_GPL(xenbus_grant_ring); +EXPORT_SYMBOL_GPL(xenbus_setup_ring); + +/* + * xenbus_teardown_ring + * @vaddr: starting virtual address of the ring + * @nr_pages: number of pages + * @grefs: grant reference array + * + * Remove grants for the shared ring buffer and free the associated memory. + * On return the grant reference array is filled with INVALID_GRANT_REF. + */ +void xenbus_teardown_ring(void **vaddr, unsigned int nr_pages, + grant_ref_t *grefs) +{ + unsigned int i; + for (i = 0; i < nr_pages; i++) { + if (grefs[i] != INVALID_GRANT_REF) { + gnttab_end_foreign_access(grefs[i], NULL); + grefs[i] = INVALID_GRANT_REF; + } + } -/** + if (*vaddr) + free_pages_exact(*vaddr, nr_pages * XEN_PAGE_SIZE); + *vaddr = NULL; +} +EXPORT_SYMBOL_GPL(xenbus_teardown_ring); + +/* * Allocate an event channel for the given xenbus_device, assigning the newly * created local port to *port. Return 0 on success, or -errno on error. On * error, the device will switch to XenbusStateClosing, and the error will be * saved in the store. */ -int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) +int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port) { struct evtchn_alloc_unbound alloc_unbound; int err; @@ -411,10 +490,10 @@ int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); -/** +/* * Free an existing event channel. Returns 0 on success or -errno on error. 
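And the matching event-channel half of a typical connect path, using the helpers' new evtchn_port_t typing (the my_* names are hypothetical):

static int my_connect_evtchn(struct xenbus_device *dev,
			     evtchn_port_t *port)
{
	int err;

	/* Ask Xen for an unbound port the other end can bind to. */
	err = xenbus_alloc_evtchn(dev, port);
	if (err)
		return err;	/* device is already moving to Closing */

	/* ... bind *port with bind_evtchn_to_irqhandler() and publish
	 * it in the store next to ring-ref ... */
	return 0;
}

static void my_disconnect_evtchn(struct xenbus_device *dev,
				 evtchn_port_t port)
{
	xenbus_free_evtchn(dev, port);
}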
*/ -int xenbus_free_evtchn(struct xenbus_device *dev, int port) +int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port) { struct evtchn_close close; int err; @@ -423,7 +502,7 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port) err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); if (err) - xenbus_dev_error(dev, err, "freeing event channel %d", port); + xenbus_dev_error(dev, err, "freeing event channel %u", port); return err; } @@ -431,7 +510,7 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn); /** - * xenbus_map_ring_valloc + * xenbus_map_ring_valloc - allocate & map pages of VA space * @dev: xenbus device * @gnt_refs: grant reference array * @nr_grefs: number of grant references @@ -439,16 +518,36 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn); * * Map @nr_grefs pages of memory into this domain from another * domain's grant table. xenbus_map_ring_valloc allocates @nr_grefs - * pages of virtual address space, maps the pages to that address, and - * sets *vaddr to that address. Returns 0 on success, and GNTST_* - * (see xen/include/interface/grant_table.h) or -ENOMEM / -EINVAL on - * error. If an error is returned, device will switch to + * pages of virtual address space, maps the pages to that address, and sets + * *vaddr to that address. If an error is returned, device will switch to * XenbusStateClosing and the error message will be saved in XenStore. + * + * Returns: %0 on success or -errno on error */ int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs, unsigned int nr_grefs, void **vaddr) { - return ring_ops->map(dev, gnt_refs, nr_grefs, vaddr); + int err; + struct map_ring_valloc *info; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->node = kzalloc(sizeof(*info->node), GFP_KERNEL); + if (!info->node) + err = -ENOMEM; + else + err = ring_ops->map(dev, info, gnt_refs, nr_grefs, vaddr); + + kfree(info->node); + kfree(info); + return err; } EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); @@ -459,79 +558,106 @@ static int __xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs, unsigned int nr_grefs, grant_handle_t *handles, - phys_addr_t *addrs, + struct map_ring_valloc *info, unsigned int flags, bool *leaked) { - struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; int i, j; - int err = GNTST_okay; if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_grefs; i++) { - memset(&map[i], 0, sizeof(map[i])); - gnttab_set_map_op(&map[i], addrs[i], flags, gnt_refs[i], - dev->otherend_id); + gnttab_set_map_op(&info->map[i], info->phys_addrs[i], flags, + gnt_refs[i], dev->otherend_id); handles[i] = INVALID_GRANT_HANDLE; } - gnttab_batch_map(map, i); + gnttab_batch_map(info->map, i); for (i = 0; i < nr_grefs; i++) { - if (map[i].status != GNTST_okay) { - err = map[i].status; - xenbus_dev_fatal(dev, map[i].status, + if (info->map[i].status != GNTST_okay) { + xenbus_dev_fatal(dev, info->map[i].status, "mapping in shared page %d from domain %d", gnt_refs[i], dev->otherend_id); goto fail; } else - handles[i] = map[i].handle; + handles[i] = info->map[i].handle; } - return GNTST_okay; + return 0; fail: for (i = j = 0; i < nr_grefs; i++) { if (handles[i] != INVALID_GRANT_HANDLE) { - memset(&unmap[j], 0, sizeof(unmap[j])); - gnttab_set_unmap_op(&unmap[j], (phys_addr_t)addrs[i], + gnttab_set_unmap_op(&info->unmap[j], + info->phys_addrs[i], GNTMAP_host_map, 
handles[i]); j++; } } - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, j)) - BUG(); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, info->unmap, j)); *leaked = false; for (i = 0; i < j; i++) { - if (unmap[i].status != GNTST_okay) { + if (info->unmap[i].status != GNTST_okay) { *leaked = true; break; } } - return err; + return -ENOENT; } -struct map_ring_valloc_hvm +/** + * xenbus_unmap_ring - unmap memory from another domain + * @dev: xenbus device + * @handles: grant handle array + * @nr_handles: number of handles in the array + * @vaddrs: addresses to unmap + * + * Unmap memory in this domain that was imported from another domain. + * + * Returns: %0 on success or GNTST_* on error + * (see xen/include/interface/grant_table.h). + */ +static int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t *handles, + unsigned int nr_handles, unsigned long *vaddrs) { - unsigned int idx; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; + int i; + int err; - /* Why do we need two arrays? See comment of __xenbus_map_ring */ - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - unsigned long addrs[XENBUS_MAX_RING_GRANTS]; -}; + if (nr_handles > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + for (i = 0; i < nr_handles; i++) + gnttab_set_unmap_op(&unmap[i], vaddrs[i], + GNTMAP_host_map, handles[i]); + + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)); + + err = GNTST_okay; + for (i = 0; i < nr_handles; i++) { + if (unmap[i].status != GNTST_okay) { + xenbus_dev_error(dev, unmap[i].status, + "unmapping page at handle %d error %d", + handles[i], unmap[i].status); + err = unmap[i].status; + break; + } + } + + return err; +} static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn, unsigned int goffset, unsigned int len, void *data) { - struct map_ring_valloc_hvm *info = data; + struct map_ring_valloc *info = data; unsigned long vaddr = (unsigned long)gfn_to_virt(gfn); info->phys_addrs[info->idx] = vaddr; @@ -540,39 +666,28 @@ static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn, info->idx++; } -static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, - grant_ref_t *gnt_ref, - unsigned int nr_grefs, - void **vaddr) +static int xenbus_map_ring_hvm(struct xenbus_device *dev, + struct map_ring_valloc *info, + grant_ref_t *gnt_ref, + unsigned int nr_grefs, + void **vaddr) { - struct xenbus_map_node *node; + struct xenbus_map_node *node = info->node; int err; void *addr; bool leaked = false; - struct map_ring_valloc_hvm info = { - .idx = 0, - }; unsigned int nr_pages = XENBUS_PAGES(nr_grefs); - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - *vaddr = NULL; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - err = alloc_xenballooned_pages(nr_pages, node->hvm.pages); + err = xen_alloc_unpopulated_pages(nr_pages, node->hvm.pages); if (err) goto out_err; gnttab_foreach_grant(node->hvm.pages, nr_grefs, xenbus_map_ring_setup_grant_hvm, - &info); + info); err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, - info.phys_addrs, GNTMAP_host_map, &leaked); + info, GNTMAP_host_map, &leaked); node->nr_handles = nr_grefs; if (err) @@ -592,63 +707,25 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, spin_unlock(&xenbus_valloc_lock); *vaddr = addr; + info->node = NULL; + return 0; out_xenbus_unmap_ring: if (!leaked) - xenbus_unmap_ring(dev, node->handles, nr_grefs, info.addrs); + xenbus_unmap_ring(dev, node->handles, nr_grefs, info->addrs); else pr_alert("leaking %p size 
%u page(s)", addr, nr_pages); out_free_ballooned_pages: if (!leaked) - free_xenballooned_pages(nr_pages, node->hvm.pages); + xen_free_unpopulated_pages(nr_pages, node->hvm.pages); out_err: - kfree(node); return err; } - -/** - * xenbus_map_ring - * @dev: xenbus device - * @gnt_refs: grant reference array - * @nr_grefs: number of grant reference - * @handles: pointer to grant handle to be filled - * @vaddrs: addresses to be mapped to - * @leaked: fail to clean up a failed map, caller should not free vaddr - * - * Map pages of memory into this domain from another domain's grant table. - * xenbus_map_ring does not allocate the virtual address space (you must do - * this yourself!). It only maps in the pages to the specified address. - * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) - * or -ENOMEM / -EINVAL on error. If an error is returned, device will switch to - * XenbusStateClosing and the first error message will be saved in XenStore. - * Further more if we fail to map the ring, caller should check @leaked. - * If @leaked is not zero it means xenbus_map_ring fails to clean up, caller - * should not free the address space of @vaddr. - */ -int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs, - unsigned int nr_grefs, grant_handle_t *handles, - unsigned long *vaddrs, bool *leaked) -{ - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - int i; - - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - for (i = 0; i < nr_grefs; i++) - phys_addrs[i] = (unsigned long)vaddrs[i]; - - return __xenbus_map_ring(dev, gnt_refs, nr_grefs, handles, - phys_addrs, GNTMAP_host_map, leaked); -} -EXPORT_SYMBOL_GPL(xenbus_map_ring); - - /** - * xenbus_unmap_ring_vfree + * xenbus_unmap_ring_vfree - unmap a page of memory from another domain * @dev: xenbus device * @vaddr: addr to unmap * @@ -656,7 +733,8 @@ EXPORT_SYMBOL_GPL(xenbus_map_ring); * Unmap a page of memory in this domain that was imported from another domain. * Use xenbus_unmap_ring_vfree if you mapped in your memory with * xenbus_map_ring_valloc (it will free the virtual address space). - * Returns 0 on success and returns GNTST_* on error + * + * Returns: %0 on success or GNTST_* on error * (see xen/include/interface/grant_table.h). 
*/ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) @@ -666,40 +744,33 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); #ifdef CONFIG_XEN_PV -static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, - grant_ref_t *gnt_refs, - unsigned int nr_grefs, - void **vaddr) +static int map_ring_apply(pte_t *pte, unsigned long addr, void *data) { - struct xenbus_map_node *node; - struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - int err = GNTST_okay; - int i; - bool leaked; - - *vaddr = NULL; + struct map_ring_valloc *info = data; - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; + info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr; + return 0; +} - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; +static int xenbus_map_ring_pv(struct xenbus_device *dev, + struct map_ring_valloc *info, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node = info->node; + struct vm_struct *area; + bool leaked = false; + int err = -ENOMEM; - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); - if (!area) { - kfree(node); + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); + if (!area) return -ENOMEM; - } - - for (i = 0; i < nr_grefs; i++) - phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; - + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info)) + goto failed; err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, - phys_addrs, - GNTMAP_host_map | GNTMAP_contains_pte, + info, GNTMAP_host_map | GNTMAP_contains_pte, &leaked); if (err) goto failed; @@ -712,6 +783,8 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, spin_unlock(&xenbus_valloc_lock); *vaddr = area->addr; + info->node = NULL; + return 0; failed: @@ -720,11 +793,10 @@ failed: else pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); - kfree(node); return err; } -static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) +static int xenbus_unmap_ring_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; @@ -761,8 +833,7 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) unmap[i].handle = node->handles[i]; } - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)) - BUG(); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)); err = GNTST_okay; leaked = false; @@ -788,12 +859,12 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) } static const struct xenbus_ring_ops ring_ops_pv = { - .map = xenbus_map_ring_valloc_pv, - .unmap = xenbus_unmap_ring_vfree_pv, + .map = xenbus_map_ring_pv, + .unmap = xenbus_unmap_ring_pv, }; #endif -struct unmap_ring_vfree_hvm +struct unmap_ring_hvm { unsigned int idx; unsigned long addrs[XENBUS_MAX_RING_GRANTS]; @@ -804,19 +875,19 @@ static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn, unsigned int len, void *data) { - struct unmap_ring_vfree_hvm *info = data; + struct unmap_ring_hvm *info = data; info->addrs[info->idx] = (unsigned long)gfn_to_virt(gfn); info->idx++; } -static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) +static int xenbus_unmap_ring_hvm(struct xenbus_device *dev, void *vaddr) { int rv; struct xenbus_map_node *node; void *addr; 
- struct unmap_ring_vfree_hvm info = { + struct unmap_ring_hvm info = { .idx = 0, }; unsigned int nr_pages; @@ -849,7 +920,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) info.addrs); if (!rv) { vunmap(vaddr); - free_xenballooned_pages(nr_pages, node->hvm.pages); + xen_free_unpopulated_pages(nr_pages, node->hvm.pages); } else WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages); @@ -859,55 +930,10 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) } /** - * xenbus_unmap_ring - * @dev: xenbus device - * @handles: grant handle array - * @nr_handles: number of handles in the array - * @vaddrs: addresses to unmap - * - * Unmap memory in this domain that was imported from another domain. - * Returns 0 on success and returns GNTST_* on error - * (see xen/include/interface/grant_table.h). - */ -int xenbus_unmap_ring(struct xenbus_device *dev, - grant_handle_t *handles, unsigned int nr_handles, - unsigned long *vaddrs) -{ - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; - int i; - int err; - - if (nr_handles > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - for (i = 0; i < nr_handles; i++) - gnttab_set_unmap_op(&unmap[i], vaddrs[i], - GNTMAP_host_map, handles[i]); - - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)) - BUG(); - - err = GNTST_okay; - for (i = 0; i < nr_handles; i++) { - if (unmap[i].status != GNTST_okay) { - xenbus_dev_error(dev, unmap[i].status, - "unmapping page at handle %d error %d", - handles[i], unmap[i].status); - err = unmap[i].status; - break; - } - } - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_unmap_ring); - - -/** - * xenbus_read_driver_state + * xenbus_read_driver_state - read state from a store path * @path: path for driver * - * Return the state of the driver rooted at the given store path, or + * Returns: the state of the driver rooted at the given store path, or * XenbusStateUnknown if no state can be read. 
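A one-line consumer of this helper, as a hedged example (my_otherend_connected is a hypothetical name):

static bool my_otherend_connected(struct xenbus_device *dev)
{
	/* dev->otherend is the peer's directory in the store; the helper
	 * reads its "state" child and maps any read failure to
	 * XenbusStateUnknown. */
	return xenbus_read_driver_state(dev->otherend) ==
	       XenbusStateConnected;
}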
*/ enum xenbus_state xenbus_read_driver_state(const char *path) @@ -922,14 +948,14 @@ enum xenbus_state xenbus_read_driver_state(const char *path) EXPORT_SYMBOL_GPL(xenbus_read_driver_state); static const struct xenbus_ring_ops ring_ops_hvm = { - .map = xenbus_map_ring_valloc_hvm, - .unmap = xenbus_unmap_ring_vfree_hvm, + .map = xenbus_map_ring_hvm, + .unmap = xenbus_unmap_ring_hvm, }; void __init xenbus_ring_ops_init(void) { #ifdef CONFIG_XEN_PV - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) ring_ops = &ring_ops_pv; else #endif diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index d239fc3c5e3d..82df2da1b880 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -57,16 +57,8 @@ DEFINE_MUTEX(xs_response_mutex); static int xenbus_irq; static struct task_struct *xenbus_task; -static DECLARE_WORK(probe_work, xenbus_probe); - - static irqreturn_t wake_waiting(int irq, void *unused) { - if (unlikely(xenstored_ready == 0)) { - xenstored_ready = 1; - schedule_work(&probe_work); - } - wake_up(&xb_waitq); return IRQ_HANDLED; } @@ -313,10 +305,12 @@ static int process_msg(void) req->msg.type = state.msg.type; req->msg.len = state.msg.len; req->body = state.body; + /* write body, then update state */ + virt_wmb(); req->state = xb_req_state_got_reply; req->cb(req); - } else - kfree(req); + } + kref_put(&req->kref, xs_free_req); } mutex_unlock(&xs_response_mutex); @@ -392,12 +386,13 @@ static int process_writes(void) state.req->msg.type = XS_ERROR; state.req->err = err; list_del(&state.req->list); - if (state.req->state == xb_req_state_aborted) - kfree(state.req); - else { + if (state.req->state != xb_req_state_aborted) { + /* write err, then update state */ + virt_wmb(); state.req->state = xb_req_state_got_reply; wake_up(&state.req->wq); } + kref_put(&state.req->kref, xs_free_req); mutex_unlock(&xb_write_mutex); diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index c3e201025ef0..f5c21ba64df5 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -55,6 +55,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/miscdevice.h> +#include <linux/workqueue.h> #include <xen/xenbus.h> #include <xen/xen.h> @@ -62,6 +63,8 @@ #include "xenbus.h" +unsigned int xb_dev_generation_id; + /* * An element of a list of outstanding transactions, for which we're * still waiting a reply. @@ -69,6 +72,7 @@ struct xenbus_transaction_holder { struct list_head list; struct xenbus_transaction handle; + unsigned int generation_id; }; /* @@ -78,7 +82,7 @@ struct read_buffer { struct list_head list; unsigned int cons; unsigned int len; - char msg[]; + char msg[] __counted_by(len); }; struct xenbus_file_priv { @@ -113,6 +117,8 @@ struct xenbus_file_priv { wait_queue_head_t read_waitq; struct kref kref; + + struct work_struct wq; }; /* Read out any raw xenbus messages queued up. 
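The kref and virt_wmb() changes in xenbus_comms.c above give each xb_req_data two owners, the requester and the reply path. A self-contained userspace analogue of the refcount half (names and the C11 atomics are illustrative; the kernel uses kref_init() and kref_put(&req->kref, xs_free_req)):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct req {
	atomic_int refs;
	char *body;
};

static struct req *req_alloc(void)
{
	struct req *r = calloc(1, sizeof(*r));

	if (r)	/* one ref for the requester, one for the reply path */
		atomic_init(&r->refs, 2);
	return r;
}

static void req_put(struct req *r)
{
	/* Whoever drops the last reference frees, so there is no double
	 * free when the two sides finish in either order. */
	if (atomic_fetch_sub(&r->refs, 1) == 1) {
		free(r->body);
		free(r);
	}
}

int main(void)
{
	struct req *r = req_alloc();

	if (!r)
		return 1;
	req_put(r);	/* reply delivered */
	req_put(r);	/* requester done; memory released here */
	return 0;
}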
*/ @@ -122,7 +128,7 @@ static ssize_t xenbus_file_read(struct file *filp, { struct xenbus_file_priv *u = filp->private_data; struct read_buffer *rb; - unsigned i; + ssize_t i; int ret; mutex_lock(&u->reply_mutex); @@ -142,7 +148,7 @@ again: rb = list_entry(u->read_buffers.next, struct read_buffer, list); i = 0; while (i < len) { - unsigned sz = min((unsigned)len - i, rb->len - rb->cons); + size_t sz = min_t(size_t, len - i, rb->len - rb->cons); ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz); @@ -189,7 +195,7 @@ static int queue_reply(struct list_head *queue, const void *data, size_t len) if (len > XENSTORE_PAYLOAD_MAX) return -EINVAL; - rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); + rb = kmalloc(struct_size(rb, msg, len), GFP_KERNEL); if (rb == NULL) return -ENOMEM; @@ -297,14 +303,14 @@ static void watch_fired(struct xenbus_watch *watch, mutex_unlock(&adap->dev_data->reply_mutex); } -static void xenbus_file_free(struct kref *kref) +static void xenbus_worker(struct work_struct *wq) { struct xenbus_file_priv *u; struct xenbus_transaction_holder *trans, *tmp; struct watch_adapter *watch, *tmp_watch; struct read_buffer *rb, *tmp_rb; - u = container_of(kref, struct xenbus_file_priv, kref); + u = container_of(wq, struct xenbus_file_priv, wq); /* * No need for locking here because there are no other users, @@ -330,6 +336,18 @@ static void xenbus_file_free(struct kref *kref) kfree(u); } +static void xenbus_file_free(struct kref *kref) +{ + struct xenbus_file_priv *u; + + /* + * We might be called in xenbus_thread(). + * Use workqueue to avoid deadlock. + */ + u = container_of(kref, struct xenbus_file_priv, kref); + schedule_work(&u->wq); +} + static struct xenbus_transaction_holder *xenbus_get_transaction( struct xenbus_file_priv *u, uint32_t tx_id) { @@ -388,7 +406,7 @@ void xenbus_dev_queue_reply(struct xb_req_data *req) mutex_unlock(&u->reply_mutex); kfree(req->body); - kfree(req); + kref_put(&req->kref, xs_free_req); kref_put(&u->kref, xenbus_file_free); @@ -441,6 +459,7 @@ static int xenbus_write_transaction(unsigned msg_type, rc = -ENOMEM; goto out; } + trans->generation_id = xb_dev_generation_id; list_add(&trans->list, &u->transactions); } else if (msg->hdr.tx_id != 0 && !xenbus_get_transaction(u, msg->hdr.tx_id)) @@ -449,6 +468,20 @@ static int xenbus_write_transaction(unsigned msg_type, !(msg->hdr.len == 2 && (!strcmp(msg->body, "T") || !strcmp(msg->body, "F")))) return xenbus_command_reply(u, XS_ERROR, "EINVAL"); + else if (msg_type == XS_TRANSACTION_END) { + trans = xenbus_get_transaction(u, msg->hdr.tx_id); + if (trans && trans->generation_id != xb_dev_generation_id) { + list_del(&trans->list); + kfree(trans); + if (!strcmp(msg->body, "T")) + return xenbus_command_reply(u, XS_ERROR, + "EAGAIN"); + else + return xenbus_command_reply(u, + XS_TRANSACTION_END, + "OK"); + } + } rc = xenbus_dev_request_and_reply(&msg->hdr, u); if (rc && trans) { @@ -465,7 +498,6 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) struct watch_adapter *watch; char *path, *token; int err, rc; - LIST_HEAD(staging_q); path = u->u.buffer + sizeof(u->u.msg); token = memchr(path, 0, u->u.msg.len); @@ -523,7 +555,6 @@ static ssize_t xenbus_file_write(struct file *filp, uint32_t msg_type; int rc = len; int ret; - LIST_HEAD(staging_q); /* * We're expecting usermode to be writing properly formed @@ -622,9 +653,7 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) if (xen_store_evtchn == 0) return -ENOENT; - nonseekable_open(inode, filp); - - filp->f_mode &= 
~FMODE_ATOMIC_POS; /* cdev-style semantics */ + stream_open(inode, filp); u = kzalloc(sizeof(*u), GFP_KERNEL); if (u == NULL) @@ -636,6 +665,7 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) INIT_LIST_HEAD(&u->watches); INIT_LIST_HEAD(&u->read_buffers); init_waitqueue_head(&u->read_waitq); + INIT_WORK(&u->wq, xenbus_worker); mutex_init(&u->reply_mutex); mutex_init(&u->msgbuffer_mutex); @@ -670,7 +700,6 @@ const struct file_operations xen_xenbus_fops = { .open = xenbus_file_open, .release = xenbus_file_release, .poll = xenbus_file_poll, - .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(xen_xenbus_fops); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 5b471889d723..86fe6e779056 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -31,6 +31,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define dev_fmt pr_fmt #define DPRINTK(fmt, args...) \ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ @@ -51,7 +52,6 @@ #include <linux/module.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <xen/xen.h> @@ -65,12 +65,17 @@ #include "xenbus.h" +static int xs_init_irq = -1; int xen_store_evtchn; EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +#define XS_INTERFACE_READY \ + ((xen_store_interface != NULL) && \ + (xen_store_interface->connection == XENSTORE_CONNECTED)) + enum xenstore_init xen_store_domain_type; EXPORT_SYMBOL_GPL(xen_store_domain_type); @@ -89,9 +94,9 @@ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) return NULL; } -int xenbus_match(struct device *_dev, struct device_driver *_drv) +int xenbus_match(struct device *_dev, const struct device_driver *_drv) { - struct xenbus_driver *drv = to_xenbus_driver(_drv); + const struct xenbus_driver *drv = to_xenbus_driver(_drv); if (!drv->ids) return 0; @@ -136,6 +141,7 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus); return xenbus_watch_pathfmt(dev, &dev->otherend_watch, + bus->otherend_will_handle, bus->otherend_changed, "%s/%s", dev->otherend, "state"); } @@ -205,6 +211,64 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, } EXPORT_SYMBOL_GPL(xenbus_otherend_changed); +#define XENBUS_SHOW_STAT(name) \ +static ssize_t name##_show(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + \ + return sprintf(buf, "%d\n", atomic_read(&dev->name)); \ +} \ +static DEVICE_ATTR_RO(name) + +XENBUS_SHOW_STAT(event_channels); +XENBUS_SHOW_STAT(events); +XENBUS_SHOW_STAT(spurious_events); +XENBUS_SHOW_STAT(jiffies_eoi_delayed); + +static ssize_t spurious_threshold_show(struct device *_dev, + struct device_attribute *attr, + char *buf) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + + return sprintf(buf, "%d\n", dev->spurious_threshold); +} + +static ssize_t spurious_threshold_store(struct device *_dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + dev->spurious_threshold = val; + + return count; +} + +static DEVICE_ATTR_RW(spurious_threshold); + +static struct attribute *xenbus_attrs[] = { + &dev_attr_event_channels.attr, + &dev_attr_events.attr, + &dev_attr_spurious_events.attr, + 
&dev_attr_jiffies_eoi_delayed.attr, + &dev_attr_spurious_threshold.attr, + NULL +}; + +static const struct attribute_group xenbus_group = { + .name = "xenbus", + .attrs = xenbus_attrs, +}; + int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); @@ -232,67 +296,79 @@ int xenbus_dev_probe(struct device *_dev) return err; } + if (!try_module_get(drv->driver.owner)) { + dev_warn(&dev->dev, "failed to acquire module reference on '%s'\n", + drv->driver.name); + err = -ESRCH; + goto fail; + } + + down(&dev->reclaim_sem); err = drv->probe(dev, id); + up(&dev->reclaim_sem); if (err) - goto fail; + goto fail_put; err = watch_otherend(dev); if (err) { dev_warn(&dev->dev, "watch_otherend on %s failed.\n", dev->nodename); - return err; + goto fail_remove; } + dev->spurious_threshold = 1; + if (sysfs_create_group(&dev->dev.kobj, &xenbus_group)) + dev_warn(&dev->dev, "sysfs_create_group on %s failed.\n", + dev->nodename); + return 0; +fail_remove: + if (drv->remove) { + down(&dev->reclaim_sem); + drv->remove(dev); + up(&dev->reclaim_sem); + } +fail_put: + module_put(drv->driver.owner); fail: xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); - xenbus_switch_state(dev, XenbusStateClosed); return err; } EXPORT_SYMBOL_GPL(xenbus_dev_probe); -int xenbus_dev_remove(struct device *_dev) +void xenbus_dev_remove(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); DPRINTK("%s", dev->nodename); + sysfs_remove_group(&dev->dev.kobj, &xenbus_group); + free_otherend_watch(dev); - if (drv->remove) + if (drv->remove) { + down(&dev->reclaim_sem); drv->remove(dev); + up(&dev->reclaim_sem); + } + + module_put(drv->driver.owner); free_otherend_details(dev); - xenbus_switch_state(dev, XenbusStateClosed); - return 0; + /* + * If the toolstack has forced the device state to closing then set + * the state to closed now to allow it to be cleaned up. + * Similarly, if the driver does not support re-bind, set the + * closed. + */ + if (!drv->allow_rebind || + xenbus_read_driver_state(dev->nodename) == XenbusStateClosing) + xenbus_switch_state(dev, XenbusStateClosed); } EXPORT_SYMBOL_GPL(xenbus_dev_remove); -void xenbus_dev_shutdown(struct device *_dev) -{ - struct xenbus_device *dev = to_xenbus_device(_dev); - unsigned long timeout = 5*HZ; - - DPRINTK("%s", dev->nodename); - - get_device(&dev->dev); - if (dev->state != XenbusStateConnected) { - pr_info("%s: %s: %s != Connected, skipping\n", - __func__, dev->nodename, xenbus_strstate(dev->state)); - goto out; - } - xenbus_switch_state(dev, XenbusStateClosing); - timeout = wait_for_completion_timeout(&dev->down, timeout); - if (!timeout) - pr_info("%s: %s timeout closing device\n", - __func__, dev->nodename); - out: - put_device(&dev->dev); -} -EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); - int xenbus_register_driver_common(struct xenbus_driver *drv, struct xen_bus_type *bus, struct module *owner, const char *mod_name) @@ -472,6 +548,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, goto fail; dev_set_name(&xendev->dev, "%s", devname); + sema_init(&xendev->reclaim_sem, 1); /* Register with generic device framework. 
*/ err = device_register(&xendev->dev); @@ -606,7 +683,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - pr_warn("suspend %s failed: %i\n", dev_name(dev), err); + dev_warn(dev, "suspend failed: %i\n", err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -625,8 +702,7 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - pr_warn("resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + dev_warn(dev, "resume (talk_to_otherend) failed: %i\n", err); return err; } @@ -635,15 +711,14 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - pr_warn("resume %s failed: %i\n", dev_name(dev), err); + dev_warn(dev, "resume failed: %i\n", err); return err; } } err = watch_otherend(xdev); if (err) { - pr_warn("resume (watch_otherend) %s failed: %d.\n", - dev_name(dev), err); + dev_warn(dev, "resume (watch_otherend) failed: %d\n", err); return err; } @@ -682,29 +757,126 @@ void unregister_xenstore_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); -void xenbus_probe(struct work_struct *unused) +static void xenbus_probe(void) { xenstored_ready = 1; + if (!xen_store_interface) + xen_store_interface = memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); + /* + * Now it is safe to free the IRQ used for xenstore late + * initialization. No need to unbind: it is about to be + * bound again from xb_init_comms. Note that calling + * unbind_from_irqhandler now would result in xen_evtchn_close() + * being called and the event channel not being enabled again + * afterwards, resulting in missed event notifications. + */ + if (xs_init_irq >= 0) + free_irq(xs_init_irq, &xb_waitq); + + /* + * In the HVM case, xenbus_init() deferred its call to + * xs_init() in case callbacks were not operational yet. + * So do it now. + */ + if (xen_store_domain_type == XS_HVM) + xs_init(); + /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } -EXPORT_SYMBOL_GPL(xenbus_probe); + +/* + * Returns true when XenStore init must be deferred in order to + * allow the PCI platform device to be initialised, before we + * can actually have event channel interrupts working. + */ +static bool xs_hvm_defer_init_for_callback(void) +{ +#ifdef CONFIG_XEN_PVHVM + return xen_store_domain_type == XS_HVM && + !xen_have_vector_callback; +#else + return false; +#endif +} + +static int xenbus_probe_thread(void *unused) +{ + DEFINE_WAIT(w); + + /* + * We actually just want to wait for *any* trigger of xb_waitq, + * and run xenbus_probe() the moment it occurs. + */ + prepare_to_wait(&xb_waitq, &w, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&xb_waitq, &w); + + DPRINTK("probing"); + xenbus_probe(); + return 0; +} static int __init xenbus_probe_initcall(void) { if (!xen_domain()) return -ENODEV; - if (xen_initial_domain() || xen_hvm_domain()) - return 0; + /* + * Probe XenBus here in the XS_PV case, and also XS_HVM unless we + * need to wait for the platform PCI device to come up or + * xen_store_interface is not ready. + */ + if (xen_store_domain_type == XS_PV || + (xen_store_domain_type == XS_HVM && + !xs_hvm_defer_init_for_callback() && + XS_INTERFACE_READY)) + xenbus_probe(); + + /* + * For XS_LOCAL or when xen_store_interface is not ready, spawn a + * thread which will wait for xenstored or a xenstore-stubdom to be + * started, then probe. 
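Condensing the decision logic the rewritten initcall applies (a restatement of the code in this diff using its own helpers, not new behaviour):

static bool probe_now(void)
{
	/* PV: the interface is set up from xen_start_info at boot. */
	if (xen_store_domain_type == XS_PV)
		return true;

	/* HVM: only once the event-channel callback can work and
	 * xenstored reports XENSTORE_CONNECTED. */
	return xen_store_domain_type == XS_HVM &&
	       !xs_hvm_defer_init_for_callback() &&
	       XS_INTERFACE_READY;
}

/* In every other case (XS_LOCAL, or an interface that is not yet
 * connected) the "xenbus_probe" kthread blocks on xb_waitq and probes
 * when traffic first arrives. */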
It will be triggered when communication + * starts happening, by waiting on xb_waitq. + */ + if (xen_store_domain_type == XS_LOCAL || !XS_INTERFACE_READY) { + struct task_struct *probe_task; - xenbus_probe(NULL); + probe_task = kthread_run(xenbus_probe_thread, NULL, + "xenbus_probe"); + if (IS_ERR(probe_task)) + return PTR_ERR(probe_task); + } return 0; } - device_initcall(xenbus_probe_initcall); +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + int ret; + + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + + ret = HYPERVISOR_hvm_op(HVMOP_set_param, &a); + if (ret) + return ret; + + /* + * If xenbus_probe_initcall() deferred the xenbus_probe() + * due to the callback not functioning yet, we can do it now. + */ + if (!xenstored_ready && xs_hvm_defer_init_for_callback()) + xenbus_probe(); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + /* Set up event channel for xenstored which is run as a local process * (this is normally used only in dom0) */ @@ -765,10 +937,25 @@ static struct notifier_block xenbus_resume_nb = { .notifier_call = xenbus_resume_cb, }; +static irqreturn_t xenbus_late_init(int irq, void *unused) +{ + int err; + uint64_t v = 0; + + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err || !v || !~v) + return IRQ_HANDLED; + xen_store_gfn = (unsigned long)v; + + wake_up(&xb_waitq); + return IRQ_HANDLED; +} + static int __init xenbus_init(void) { - int err = 0; + int err; uint64_t v = 0; + bool wait = false; xen_store_domain_type = XS_UNKNOWN; if (!xen_domain()) @@ -779,9 +966,15 @@ static int __init xenbus_init(void) if (xen_pv_domain()) xen_store_domain_type = XS_PV; if (xen_hvm_domain()) + { xen_store_domain_type = XS_HVM; - if (xen_hvm_domain() && xen_initial_domain()) - xen_store_domain_type = XS_LOCAL; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + if (!v && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + } if (xen_pv_domain() && !xen_start_info->store_evtchn) xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && xen_start_info->store_evtchn) @@ -800,28 +993,78 @@ static int __init xenbus_init(void) xen_store_interface = gfn_to_virt(xen_store_gfn); break; case XS_HVM: - err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); - if (err) - goto out_error; - xen_store_evtchn = (int)v; err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); if (err) goto out_error; - xen_store_gfn = (unsigned long)v; - xen_store_interface = - xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, - XEN_PAGE_SIZE); + /* + * Uninitialized hvm_params are zero and return no error. + * Although it is theoretically possible to have + * HVM_PARAM_STORE_PFN set to zero on purpose, in reality it is + * not zero when valid. If zero, it means that Xenstore hasn't + * been properly initialized. Instead of attempting to map a + * wrong guest physical address return error. + * + * Also recognize all bits set as an invalid/uninitialized value. + */ + if (!v) { + err = -ENOENT; + goto out_error; + } + if (v == ~0ULL) { + wait = true; + } else { + /* Avoid truncation on 32-bit. 
*/ +#if BITS_PER_LONG == 32 + if (v > ULONG_MAX) { + pr_err("%s: cannot handle HVM_PARAM_STORE_PFN=%llx > ULONG_MAX\n", + __func__, v); + err = -EINVAL; + goto out_error; + } +#endif + xen_store_gfn = (unsigned long)v; + xen_store_interface = + memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); + if (!xen_store_interface) { + pr_err("%s: cannot map HVM_PARAM_STORE_PFN=%llx\n", + __func__, v); + err = -EINVAL; + goto out_error; + } + if (xen_store_interface->connection != XENSTORE_CONNECTED) + wait = true; + } + if (wait) { + err = bind_evtchn_to_irqhandler(xen_store_evtchn, + xenbus_late_init, + 0, "xenstore_late_init", + &xb_waitq); + if (err < 0) { + pr_err("xenstore_late_init couldn't bind irq err=%d\n", + err); + goto out_error; + } + + xs_init_irq = err; + } break; default: pr_warn("Xenstore state unknown\n"); break; } - /* Initialize the interface to xenstore. */ - err = xs_init(); - if (err) { - pr_warn("Error initializing xenstore comms: %i\n", err); - goto out_error; + /* + * HVM domains may not have a functional callback yet. In that + * case let xs_init() be called from xenbus_probe(), which will + * get invoked at an appropriate time. + */ + if (xen_store_domain_type != XS_HVM) { + err = xs_init(); + if (err) { + pr_warn("Error initializing xenstore comms: %i\n", err); + goto out_error; + } } if ((xen_store_domain_type != XS_LOCAL) && @@ -835,8 +1078,10 @@ static int __init xenbus_init(void) */ proc_create_mount_point("xen"); #endif + return 0; out_error: + xen_store_domain_type = XS_UNKNOWN; return err; } diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index b0bed4faf44c..5ebb7233076f 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -45,9 +45,9 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/export.h> +#include <linux/semaphore.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <asm/hypervisor.h> #include <xen/xenbus.h> @@ -92,12 +92,12 @@ static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) return 0; } -static int xenbus_uevent_backend(struct device *dev, +static int xenbus_uevent_backend(const struct device *dev, struct kobj_uevent_env *env) { - struct xenbus_device *xdev; - struct xenbus_driver *drv; - struct xen_bus_type *bus; + const struct xenbus_device *xdev; + const struct xenbus_driver *drv; + const struct xen_bus_type *bus; DPRINTK(""); @@ -180,6 +180,12 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, return err; } +static bool frontend_will_handle(struct xenbus_watch *watch, + const char *path, const char *token) +{ + return watch->nr_pending == 0; +} + static void frontend_changed(struct xenbus_watch *watch, const char *path, const char *token) { @@ -191,6 +197,7 @@ static struct xen_bus_type xenbus_backend = { .levels = 3, /* backend/type/<frontend>/<id> */ .get_bus_id = backend_bus_id, .probe = xenbus_probe_backend, + .otherend_will_handle = frontend_will_handle, .otherend_changed = frontend_changed, .bus = { .name = "xen-backend", @@ -198,7 +205,6 @@ static struct xen_bus_type xenbus_backend = { .uevent = xenbus_uevent_backend, .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, .dev_groups = xenbus_dev_groups, }, }; @@ -248,8 +254,39 @@ static int backend_probe_and_watch(struct notifier_block *notifier, return NOTIFY_DONE; } +static int backend_reclaim_memory(struct device *dev, 
void *data) +{ + const struct xenbus_driver *drv; + struct xenbus_device *xdev; + + if (!dev->driver) + return 0; + drv = to_xenbus_driver(dev->driver); + if (drv && drv->reclaim_memory) { + xdev = to_xenbus_device(dev); + if (down_trylock(&xdev->reclaim_sem)) + return 0; + drv->reclaim_memory(xdev); + up(&xdev->reclaim_sem); + } + return 0; +} + +/* + * Returns 0 always because we are using shrinker to only detect memory + * pressure. + */ +static unsigned long backend_shrink_memory_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, + backend_reclaim_memory); + return 0; +} + static int __init xenbus_probe_backend_init(void) { + struct shrinker *backend_memory_shrinker; static struct notifier_block xenstore_notifier = { .notifier_call = backend_probe_and_watch }; @@ -264,6 +301,16 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); + backend_memory_shrinker = shrinker_alloc(0, "xen-backend"); + if (!backend_memory_shrinker) { + pr_warn("shrinker allocation failed\n"); + return 0; + } + + backend_memory_shrinker->count_objects = backend_shrink_memory_count; + + shrinker_register(backend_memory_shrinker); + return 0; } subsys_initcall(xenbus_probe_backend_init); diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 07896f4b2736..6d1819269cbe 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DPRINTK(fmt, ...) \ @@ -18,7 +19,6 @@ #include <linux/module.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/xen/hypervisor.h> #include <xen/xenbus.h> #include <xen/events.h> @@ -40,7 +40,7 @@ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) return -EINVAL; } - strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); + strscpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; @@ -73,10 +73,10 @@ static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, return err; } -static int xenbus_uevent_frontend(struct device *_dev, +static int xenbus_uevent_frontend(const struct device *_dev, struct kobj_uevent_env *env) { - struct xenbus_device *dev = to_xenbus_device(_dev); + const struct xenbus_device *dev = to_xenbus_device(_dev); if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) return -ENOMEM; @@ -125,6 +125,28 @@ static int xenbus_frontend_dev_probe(struct device *dev) return xenbus_dev_probe(dev); } +static void xenbus_frontend_dev_shutdown(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; + + DPRINTK("%s", dev->nodename); + + get_device(&dev->dev); + if (dev->state != XenbusStateConnected) { + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); + goto out; + } + xenbus_switch_state(dev, XenbusStateClosing); + timeout = wait_for_completion_timeout(&dev->down, timeout); + if (!timeout) + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); + out: + put_device(&dev->dev); +} + static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, .resume = xenbus_frontend_dev_resume, @@ -145,7 +167,7 @@ static struct xen_bus_type xenbus_frontend = { .uevent = xenbus_uevent_frontend, .probe = 
xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, + .shutdown = xenbus_frontend_dev_shutdown, .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, @@ -189,19 +211,11 @@ static int is_device_connecting(struct device *dev, void *data, bool ignore_none if (drv && (dev->driver != drv)) return 0; - if (ignore_nonessential) { - /* With older QEMU, for PVonHVM guests the guest config files - * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] - * which is nonsensical as there is no PV FB (there can be - * a PVKB) running as HVM guest. */ + xendrv = to_xenbus_driver(dev->driver); - if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) - return 0; + if (ignore_nonessential && xendrv->not_essential) + return 0; - if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) - return 0; - } - xendrv = to_xenbus_driver(dev->driver); return (xendev->state < XenbusStateConnected || (xendev->state == XenbusStateConnected && xendrv->is_ready && !xendrv->is_ready(xendev))); @@ -379,12 +393,12 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) case XenbusStateConnected: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); xenbus_reset_wait_for_backend(be, XenbusStateClosing); - /* fall through */ + fallthrough; case XenbusStateClosing: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); xenbus_reset_wait_for_backend(be, XenbusStateClosed); - /* fall through */ + fallthrough; case XenbusStateClosed: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); @@ -415,7 +429,7 @@ static void xenbus_check_frontend(char *class, char *dev) printk(KERN_DEBUG "XENBUS: frontend %s %s\n", frontend, xenbus_strstate(fe_state)); backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); - if (!backend || IS_ERR(backend)) + if (IS_ERR_OR_NULL(backend)) goto out; err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); if (err == 1) @@ -499,4 +513,5 @@ static int __init boot_wait_for_devices(void) late_initcall(boot_wait_for_devices); #endif +MODULE_DESCRIPTION("Xen PV-device frontend support"); MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 49a3874ae6bb..15f18374020e 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -105,12 +105,19 @@ static void xs_suspend_enter(void) static void xs_suspend_exit(void) { + xb_dev_generation_id++; spin_lock(&xs_state_lock); xs_suspend_active--; spin_unlock(&xs_state_lock); wake_up_all(&xs_state_enter_wq); } +void xs_free_req(struct kref *kref) +{ + struct xb_req_data *req = container_of(kref, struct xb_req_data, kref); + kfree(req); +} + static uint32_t xs_request_enter(struct xb_req_data *req) { uint32_t rq_id; @@ -125,7 +132,7 @@ static uint32_t xs_request_enter(struct xb_req_data *req) spin_lock(&xs_state_lock); } - if (req->type == XS_TRANSACTION_START) + if (req->type == XS_TRANSACTION_START && !req->user_req) xs_state_users++; xs_state_users++; rq_id = xs_request_id++; @@ -140,7 +147,7 @@ void xs_request_exit(struct xb_req_data *req) spin_lock(&xs_state_lock); xs_state_users--; if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) || - (req->type == XS_TRANSACTION_END && + (req->type == XS_TRANSACTION_END && !req->user_req && !WARN_ON_ONCE(req->msg.type == XS_ERROR && !strcmp(req->body, "ENOENT")))) xs_state_users--; @@ -190,8 +197,11 @@ static bool xenbus_ok(void) static bool test_reply(struct xb_req_data *req) { - if (req->state == xb_req_state_got_reply || !xenbus_ok()) + if 
(req->state == xb_req_state_got_reply || !xenbus_ok()) { + /* read req->state before all other fields */ + virt_rmb(); return true; + } /* Make sure to reread req->state each time. */ barrier(); @@ -201,7 +211,7 @@ static bool test_reply(struct xb_req_data *req) static void *read_reply(struct xb_req_data *req) { - while (req->state != xb_req_state_got_reply) { + do { wait_event(req->wq, test_reply(req)); if (!xenbus_ok()) @@ -215,7 +225,7 @@ static void *read_reply(struct xb_req_data *req) if (req->err) return ERR_PTR(req->err); - } + } while (req->state != xb_req_state_got_reply); return req->body; } @@ -233,6 +243,12 @@ static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) req->caller_req_id = req->msg.req_id; req->msg.req_id = xs_request_enter(req); + /* + * Take 2nd ref. One for this thread, and the second for the + * xenbus_thread. + */ + kref_get(&req->kref); + mutex_lock(&xb_write_mutex); list_add_tail(&req->list, &xb_write_list); notify = list_is_singular(&xb_write_list); @@ -257,8 +273,8 @@ static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg) if (req->state == xb_req_state_queued || req->state == xb_req_state_wait_reply) req->state = xb_req_state_aborted; - else - kfree(req); + + kref_put(&req->kref, xs_free_req); mutex_unlock(&xb_write_mutex); return ret; @@ -286,6 +302,8 @@ int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par) req->num_vecs = 1; req->cb = xenbus_dev_queue_reply; req->par = par; + req->user_req = true; + kref_init(&req->kref); xs_send(req, msg); @@ -313,6 +331,8 @@ static void *xs_talkv(struct xenbus_transaction t, req->vec = iovec; req->num_vecs = num_vecs; req->cb = xs_wake_up; + req->user_req = false; + kref_init(&req->kref); msg.req_id = 0; msg.tx_id = t.id; @@ -387,10 +407,10 @@ static char *join(const char *dir, const char *name) buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir); else buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name); - return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; + return buffer ?: ERR_PTR(-ENOMEM); } -static char **split(char *strings, unsigned int len, unsigned int *num) +static char **split_strings(char *strings, unsigned int len, unsigned int *num) { char *p, **ret; @@ -421,14 +441,14 @@ char **xenbus_directory(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (char **)path; + return ERR_CAST(path); strings = xs_single(t, XS_DIRECTORY, path, &len); kfree(path); if (IS_ERR(strings)) - return (char **)strings; + return ERR_CAST(strings); - return split(strings, len, num); + return split_strings(strings, len, num); } EXPORT_SYMBOL_GPL(xenbus_directory); @@ -459,7 +479,7 @@ void *xenbus_read(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (void *)path; + return ERR_CAST(path); ret = xs_single(t, XS_READ, path, len); kfree(path); @@ -492,23 +512,6 @@ int xenbus_write(struct xenbus_transaction t, } EXPORT_SYMBOL_GPL(xenbus_write); -/* Create a new directory. */ -int xenbus_mkdir(struct xenbus_transaction t, - const char *dir, const char *node) -{ - char *path; - int ret; - - path = join(dir, node); - if (IS_ERR(path)) - return PTR_ERR(path); - - ret = xs_error(xs_single(t, XS_MKDIR, path, NULL)); - kfree(path); - return ret; -} -EXPORT_SYMBOL_GPL(xenbus_mkdir); - /* Destroy a file or directory (directories must be empty). 
*/ int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) { @@ -543,18 +546,12 @@ int xenbus_transaction_start(struct xenbus_transaction *t) EXPORT_SYMBOL_GPL(xenbus_transaction_start); /* End a transaction. - * If abandon is true, transaction is discarded instead of committed. + * If abort is true, transaction is discarded instead of committed. */ -int xenbus_transaction_end(struct xenbus_transaction t, int abort) +int xenbus_transaction_end(struct xenbus_transaction t, bool abort) { - char abortstr[2]; - - if (abort) - strcpy(abortstr, "F"); - else - strcpy(abortstr, "T"); - - return xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); + return xs_error(xs_single(t, XS_TRANSACTION_END, abort ? "F" : "T", + NULL)); } EXPORT_SYMBOL_GPL(xenbus_transaction_end); @@ -699,9 +696,13 @@ int xs_watch_msg(struct xs_watch_event *event) spin_lock(&watches_lock); event->handle = find_watch(event->token); - if (event->handle != NULL) { + if (event->handle != NULL && + (!event->handle->will_handle || + event->handle->will_handle(event->handle, + event->path, event->token))) { spin_lock(&watch_events_lock); list_add_tail(&event->list, &watch_events); + event->handle->nr_pending++; wake_up(&watch_events_waitq); spin_unlock(&watch_events_lock); } else @@ -711,26 +712,6 @@ int xs_watch_msg(struct xs_watch_event *event) return 0; } -/* - * Certain older XenBus toolstack cannot handle reading values that are - * not populated. Some Xen 3.4 installation are incapable of doing this - * so if we are running on anything older than 4 do not attempt to read - * control/platform-feature-xs_reset_watches. - */ -static bool xen_strict_xenbus_quirk(void) -{ -#ifdef CONFIG_X86 - uint32_t eax, ebx, ecx, edx, base; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - if ((eax >> 16) < 4) - return true; -#endif - return false; - -} static void xs_reset_watches(void) { int err; @@ -738,9 +719,6 @@ static void xs_reset_watches(void) if (!xen_hvm_domain() || xen_initial_domain()) return; - if (xen_strict_xenbus_quirk()) - return; - if (!xenbus_read_unsigned("control", "platform-feature-xs_reset_watches", 0)) return; @@ -759,6 +737,8 @@ int register_xenbus_watch(struct xenbus_watch *watch) sprintf(token, "%lX", (long)watch); + watch->nr_pending = 0; + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); @@ -808,11 +788,14 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) /* Cancel pending watch events. 
*/ spin_lock(&watch_events_lock); - list_for_each_entry_safe(event, tmp, &watch_events, list) { - if (event->handle != watch) - continue; - list_del(&event->list); - kfree(event); + if (watch->nr_pending) { + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) + continue; + list_del(&event->list); + kfree(event); + } + watch->nr_pending = 0; } spin_unlock(&watch_events_lock); @@ -825,8 +808,8 @@ void xs_suspend(void) { xs_suspend_enter(); - down_write(&xs_watch_rwsem); mutex_lock(&xs_response_mutex); + down_write(&xs_watch_rwsem); } void xs_resume(void) @@ -851,15 +834,14 @@ void xs_resume(void) void xs_suspend_cancel(void) { - mutex_unlock(&xs_response_mutex); up_write(&xs_watch_rwsem); + mutex_unlock(&xs_response_mutex); xs_suspend_exit(); } static int xenwatch_thread(void *unused) { - struct list_head *ent; struct xs_watch_event *event; xenwatch_pid = current->pid; @@ -874,13 +856,15 @@ static int xenwatch_thread(void *unused) mutex_lock(&xenwatch_mutex); spin_lock(&watch_events_lock); - ent = watch_events.next; - if (ent != &watch_events) - list_del(ent); + event = list_first_entry_or_null(&watch_events, + struct xs_watch_event, list); + if (event) { + list_del(&event->list); + event->handle->nr_pending--; + } spin_unlock(&watch_events_lock); - if (ent != &watch_events) { - event = list_entry(ent, struct xs_watch_event, list); + if (event) { event->handle->callback(event->handle, event->path, event->token); kfree(event); diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 1a83010ddffa..8490644df1a3 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 71ddfb4cf61c..37ea7c5c0346 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * xenfs.c - a filesystem for passing info between a domain and * the hypervisor. @@ -13,6 +14,7 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/magic.h> #include <xen/xen.h> @@ -42,7 +44,7 @@ static const struct file_operations capabilities_file_ops = { .llseek = default_llseek, }; -static int xenfs_fill_super(struct super_block *sb, void *data, int silent) +static int xenfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr xenfs_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, @@ -67,18 +69,26 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) xen_initial_domain() ?
xenfs_init_files : xenfs_files); } -static struct dentry *xenfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int xenfs_get_tree(struct fs_context *fc) { - return mount_single(fs_type, flags, data, xenfs_fill_super); + return get_tree_single(fc, xenfs_fill_super); +} + +static const struct fs_context_operations xenfs_context_ops = { + .get_tree = xenfs_get_tree, +}; + +static int xenfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &xenfs_context_ops; + return 0; } static struct file_system_type xenfs_type = { .owner = THIS_MODULE, .name = "xenfs", - .mount = xenfs_mount, - .kill_sb = kill_litter_super, + .init_fs_context = xenfs_init_fs_context, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("xenfs"); diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c index c6c73a33c44d..088b7f02c358 100644 --- a/drivers/xen/xenfs/xensyms.c +++ b/drivers/xen/xenfs/xensyms.c @@ -48,7 +48,7 @@ static int xensyms_next_sym(struct xensyms *xs) return -ENOMEM; set_xen_guest_handle(symdata->name, xs->name); - symdata->symnum--; /* Rewind */ + symdata->symnum = symnum; /* Rewind */ ret = HYPERVISOR_platform_op(&xs->op); if (ret < 0) @@ -64,7 +64,7 @@ static int xensyms_next_sym(struct xensyms *xs) static void *xensyms_start(struct seq_file *m, loff_t *pos) { - struct xensyms *xs = (struct xensyms *)m->private; + struct xensyms *xs = m->private; xs->op.u.symdata.symnum = *pos; @@ -76,9 +76,9 @@ static void *xensyms_start(struct seq_file *m, loff_t *pos) static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) { - struct xensyms *xs = (struct xensyms *)m->private; + struct xensyms *xs = m->private; - xs->op.u.symdata.symnum = ++(*pos); + *pos = xs->op.u.symdata.symnum; if (xensyms_next_sym(xs)) return NULL; @@ -88,7 +88,7 @@ static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) static int xensyms_show(struct seq_file *m, void *p) { - struct xensyms *xs = (struct xensyms *)m->private; + struct xensyms *xs = m->private; struct xenpf_symdata *symdata = &xs->op.u.symdata; seq_printf(m, "%016llx %c %s\n", symdata->address, @@ -120,7 +120,7 @@ static int xensyms_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - xs = (struct xensyms *)m->private; + xs = m->private; xs->namelen = XEN_KSYM_NAME_LEN + 1; xs->name = kzalloc(xs->namelen, GFP_KERNEL); @@ -138,7 +138,7 @@ static int xensyms_open(struct inode *inode, struct file *file) static int xensyms_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - struct xensyms *xs = (struct xensyms *)m->private; + struct xensyms *xs = m->private; kfree(xs->name); return seq_release_private(inode, file); diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index e7df65d32c91..f17c4c03db30 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -93,8 +93,7 @@ static void setup_hparams(unsigned long gfn, void *data) info->fgfn++; } -static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int remap_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; @@ -233,7 +232,7 @@ int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt, kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_pages, pages); + rc = xen_alloc_unpopulated_pages(nr_pages, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pages rc:%d\n", __func__, nr_pages, rc); @@ -250,7 
+249,7 @@ int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt, if (!vaddr) { pr_warn("%s Couldn't map %ld pages rc:%d\n", __func__, nr_pages, rc); - free_xenballooned_pages(nr_pages, pages); + xen_free_unpopulated_pages(nr_pages, pages); kfree(pages); kfree(pfns); return -ENOMEM; @@ -262,4 +261,35 @@ int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt, return 0; } -EXPORT_SYMBOL_GPL(xen_xlate_map_ballooned_pages); + +struct remap_pfn { + struct mm_struct *mm; + struct page **pages; + pgprot_t prot; + unsigned long i; +}; + +static int remap_pfn_fn(pte_t *ptep, unsigned long addr, void *data) +{ + struct remap_pfn *r = data; + struct page *page = r->pages[r->i]; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot)); + + set_pte_at(r->mm, addr, ptep, pte); + r->i++; + + return 0; +} + +/* Used by the privcmd module, but has to be built-in on ARM */ +int xen_remap_vma_range(struct vm_area_struct *vma, unsigned long addr, unsigned long len) +{ + struct remap_pfn r = { + .mm = vma->vm_mm, + .pages = vma->vm_private_data, + .prot = vma->vm_page_prot, + }; + + return apply_to_page_range(vma->vm_mm, addr, len, remap_pfn_fn, &r); +} +EXPORT_SYMBOL_GPL(xen_remap_vma_range);
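
The reworked xenbus_dev_probe()/xenbus_dev_remove() above changes two driver-visible contracts: remove() now returns void, and a device is only forced to XenbusStateClosed on unbind when the toolstack has moved it to Closing or the driver did not set allow_rebind. A minimal frontend sketch against those contracts — not part of the patch; the "demofront" driver and its "demo" device type are hypothetical:

#include <linux/module.h>
#include <xen/xen.h>
#include <xen/xenbus.h>

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	/* The core calls this with dev->reclaim_sem already held. */
	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static void demofront_remove(struct xenbus_device *dev)
{
	/* void now; final state handling lives in xenbus_dev_remove() */
}

static const struct xenbus_device_id demofront_ids[] = {
	{ "demo" },	/* hypothetical device type */
	{ "" }
};

static struct xenbus_driver demofront_driver = {
	.ids = demofront_ids,
	.probe = demofront_probe,
	.remove = demofront_remove,
	.allow_rebind = true,	/* don't force Closed on unbind */
	.not_essential = true,	/* boot need not wait for us to connect */
};

static int __init demofront_init(void)
{
	if (!xen_domain())
		return -ENODEV;
	return xenbus_register_frontend(&demofront_driver);
}
module_init(demofront_init);

static void __exit demofront_exit(void)
{
	xenbus_unregister_driver(&demofront_driver);
}
module_exit(demofront_exit);

MODULE_LICENSE("GPL");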
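
The backend shrinker above only signals pressure — its count_objects() always returns 0 — and fans out to each bound device's reclaim_memory() hook under a down_trylock() of reclaim_sem, so the hook can never race probe() or remove(). A hypothetical backend ("demoback") wiring up the hook, as a sketch:

#include <linux/module.h>
#include <xen/xen.h>
#include <xen/xenbus.h>

static int demoback_probe(struct xenbus_device *dev,
			  const struct xenbus_device_id *id)
{
	return 0;
}

static void demoback_reclaim_memory(struct xenbus_device *dev)
{
	/*
	 * Invoked from the shrinker via backend_reclaim_memory() with
	 * dev->reclaim_sem down_trylock()ed, so it cannot run while
	 * probe() or remove() holds the semaphore.  It runs in reclaim
	 * context: free cached grants/pages, avoid fresh allocations.
	 */
}

static const struct xenbus_device_id demoback_ids[] = {
	{ "demo" },	/* hypothetical device type */
	{ "" }
};

static struct xenbus_driver demoback_driver = {
	.ids = demoback_ids,
	.probe = demoback_probe,
	.reclaim_memory = demoback_reclaim_memory,
};

static int __init demoback_init(void)
{
	if (!xen_domain())
		return -ENODEV;
	return xenbus_register_backend(&demoback_driver);
}
module_init(demoback_init);

MODULE_LICENSE("GPL");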
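
frontend_will_handle() above suppresses queueing whenever an event for the watch is already pending, and the same knob is available to any watch owner. A coalescing-watch sketch (the "device/demo" node is made up):

#include <linux/printk.h>
#include <xen/xenbus.h>

static bool demo_will_handle(struct xenbus_watch *watch,
			     const char *path, const char *token)
{
	/*
	 * Called by xs_watch_msg() before queueing; nr_pending is
	 * maintained under watch_events_lock, so returning false while
	 * an event is outstanding coalesces a burst of xenstore writes
	 * into a single callback invocation.
	 */
	return watch->nr_pending == 0;
}

static void demo_changed(struct xenbus_watch *watch,
			 const char *path, const char *token)
{
	pr_info("xenstore node %s changed\n", path);
}

static struct xenbus_watch demo_watch = {
	.node = "device/demo",		/* hypothetical node */
	.will_handle = demo_will_handle,
	.callback = demo_changed,
};

/* registration: err = register_xenbus_watch(&demo_watch); */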
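
The kref changes above give each xb_req_data two owners — the submitting thread and xenbus_thread — so an aborted synchronous request no longer leaves a window in which the response path writes into freed memory. The same two-owner pattern in isolation (all names hypothetical):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_req {
	struct kref kref;
	/* request/reply payload ... */
};

static void demo_req_free(struct kref *kref)
{
	kfree(container_of(kref, struct demo_req, kref));
}

static struct demo_req *demo_submit(void)
{
	struct demo_req *req = kzalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return NULL;
	kref_init(&req->kref);	/* ref 1: the submitting thread */
	kref_get(&req->kref);	/* ref 2: the responder thread */
	/* ... queue req; the responder kref_put()s when it is done ... */
	return req;
}

static void demo_finish(struct demo_req *req)
{
	/* consume or abandon the reply, then drop the submitter's ref;
	 * whichever side puts last actually frees the request */
	kref_put(&req->kref, demo_req_free);
}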
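
The virt_rmb() added to test_reply() orders the read of req->state before the reads of req->body and req->err, pairing with a matching write barrier on the response side (not shown in these hunks) that publishes the payload before flipping the state. A generic sketch of that publish/consume pairing, with a hypothetical struct:

#include <linux/compiler.h>
#include <asm/barrier.h>

struct demo_reply {
	void *body;
	int err;
	int state;	/* 0 = pending, 1 = reply available */
};

static void demo_publish(struct demo_reply *r, void *body, int err)
{
	r->body = body;
	r->err = err;
	virt_wmb();			/* payload before flag ... */
	WRITE_ONCE(r->state, 1);
}

static bool demo_ready(struct demo_reply *r)
{
	if (READ_ONCE(r->state) != 1)
		return false;
	virt_rmb();			/* ... flag before payload */
	return true;
}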
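
xenbus_transaction_end() now takes a bool abort, but the canonical caller pattern — retry the whole transaction on -EAGAIN — is unchanged. A sketch of a publish helper using it (the keys written are illustrative):

#include <xen/xenbus.h>

static int demo_publish_ring(struct xenbus_device *dev,
			     unsigned int ring_ref, unsigned int evtchn)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ring_ref);
	if (err)
		goto abort;
	err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
			    evtchn);
	if (err)
		goto abort;

	err = xenbus_transaction_end(xbt, false);	/* commit */
	if (err == -EAGAIN)
		goto again;
	return err;

abort:
	xenbus_transaction_end(xbt, true);		/* discard */
	return err;
}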
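
xen_remap_vma_range() above expects the caller to have stashed its page array in vm_private_data before asking for the range's PTEs to be rewritten; privcmd is the in-tree user. A caller-side sketch (hypothetical helper, assuming the declaration comes from xen/xen-ops.h):

#include <linux/mm.h>
#include <xen/xen-ops.h>

static int demo_map_pages(struct vm_area_struct *vma, struct page **pages)
{
	/* xen_remap_vma_range() pulls the page array out of
	 * vm_private_data and installs one special PTE per page */
	vma->vm_private_data = pages;
	return xen_remap_vma_range(vma, vma->vm_start,
				   vma->vm_end - vma->vm_start);
}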
