From 178a787502123b01499c5a4617b94bb69ad49dd5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Mon, 1 Feb 2016 11:14:15 +1100 Subject: vfio: Enable VFIO device for powerpc ec53500f "kvm: Add VFIO device" added a special KVM pseudo-device which is used to handle any necessary interactions between KVM and VFIO. Currently that device is built on x86 and ARM, but not powerpc, although powerpc does support both KVM and VFIO. This makes things awkward in userspace Currently qemu prints an alarming error message if you attempt to use VFIO and it can't initialize the KVM VFIO device. We don't want to remove the warning, because lack of the KVM VFIO device could mean coherency problems on x86. On powerpc, however, the error is harmless but looks disturbing, and a test based on host architecture in qemu would be ugly, and break if we do need the KVM VFIO device for something important in future. There's nothing preventing the KVM VFIO device from being built for powerpc, so this patch turns it on. It won't actually do anything, since we don't define any of the arch_*() hooks, but it will make qemu happy and we can extend it in future if we need to. Signed-off-by: David Gibson Reviewed-by: Eric Auger Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 0570eef83fba..7f7b6d86ac73 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o + $(KVM)/eventfd.o $(KVM)/vfio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. -- cgit From fcbb2ce672848481275c1f014ad44ccd1e43a7a2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:04 +1100 Subject: KVM: PPC: Rework H_PUT_TCE/H_GET_TCE handlers This reworks the existing H_PUT_TCE/H_GET_TCE handlers to have following patches applied nicer. This moves the ioba boundaries check to a helper and adds a check for least bits which have to be zeros. The patch is pretty mechanical (only check for least ioba bits is added) so no change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 117 +++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 42 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3e0039..f29ba2c63e07 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -35,71 +35,104 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* + * Finds a TCE table descriptor by LIOBN. + * + * WARNING: This will be called in real or virtual mode on HV KVM and virtual + * mode on PR KVM + */ +static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, + unsigned long liobn) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} + +/* + * Validates IO address. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; + unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; + unsigned long size = stt->window_size >> IOMMU_PAGE_SHIFT_4K; + + if ((ioba & mask) || (idx + npages > size) || (idx + npages < idx)) + return H_PARAMETER; + + return H_SUCCESS; +} + /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; - - /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ - /* liobn, stt, stt->window_size); */ - if (ioba >= stt->window_size) - return H_PARAMETER; - - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); - - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; - return H_SUCCESS; - } - } - - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = ioba >> SPAPR_TCE_SHIFT; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + /* FIXME: Need to validate the TCE itself */ + /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ + tbl[idx % TCES_PER_PAGE] = tce; + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; + if (!stt) + return H_TOO_HARD; - if (ioba >= stt->window_size) - return H_PARAMETER; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + idx = ioba >> SPAPR_TCE_SHIFT; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); - vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } - } + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); -- cgit From 366baf28ee3fc22dea504a0bddf8edd1e9bcee70 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:05 +1100 Subject: KVM: PPC: Use RCU for arch.spapr_tce_tables At the moment only spapr_tce_tables updates are protected against races but not lookups. This fixes missing protection by using RCU for the list. As lookups also happen in real mode, this uses list_for_each_entry_lockless() (which is expected not to access any vmalloc'd memory). This converts release_spapr_tce_table() to a RCU scheduled handler. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_64_vio.c | 20 +++++++++++--------- arch/powerpc/kvm/book3s_64_vio_hv.c | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 638c6d9be9e0..b34220d2aa42 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) { #ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); #endif diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc94dad..9526c34c29c2 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -45,19 +45,16 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +static void release_spapr_tce_table(struct rcu_head *head) { - struct kvm *kvm = stt->kvm; + struct kvmppc_spapr_tce_table *stt = container_of(head, + struct kvmppc_spapr_tce_table, rcu); int i; - mutex_lock(&kvm->lock); - list_del(&stt->list); for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - kvm_put_kvm(kvm); + kfree(stt); } static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -88,7 +85,12 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; - release_spapr_tce_table(stt); + list_del_rcu(&stt->list); + + kvm_put_kvm(stt->kvm); + + call_rcu(&stt->rcu, release_spapr_tce_table); + return 0; } @@ -131,7 +133,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); mutex_unlock(&kvm->lock); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index f29ba2c63e07..124d69246e11 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -51,7 +51,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) if (stt->liobn == liobn) return stt; -- cgit From f8626985c7c2485c423ce9f448028f81535b0ecc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:06 +1100 Subject: KVM: PPC: Account TCE-containing pages in locked_vm At the moment pages used for TCE tables (in addition to pages addressed by TCEs) are not counted in locked_vm counter so a malicious userspace tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE and lock a lot of memory. This adds counting for pages used for TCE tables. This counts the number of pages required for a table plus pages for the kvmppc_spapr_tce_table struct (TCE table descriptor) itself. This changes release_spapr_tce_table() to store @npages on stack to avoid calling kvmppc_stt_npages() in the loop (tiny optimization, probably). This does not change the amount of used memory. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 63 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 9526c34c29c2..1a1e14f8572f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -39,19 +39,65 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) -static long kvmppc_stt_npages(unsigned long window_size) +static unsigned long kvmppc_tce_pages(unsigned long window_size) { return ALIGN((window_size >> SPAPR_TCE_SHIFT) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +static unsigned long kvmppc_stt_pages(unsigned long tce_pages) +{ + unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) + + (tce_pages * sizeof(struct page *)); + + return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; +} + +static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) +{ + long ret = 0; + + if (!current || !current->mm) + return ret; /* process exited */ + + down_write(¤t->mm->mmap_sem); + + if (inc) { + unsigned long locked, lock_limit; + + locked = current->mm->locked_vm + stt_pages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += stt_pages; + } else { + if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) + stt_pages = current->mm->locked_vm; + + current->mm->locked_vm -= stt_pages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, + inc ? '+' : '-', + stt_pages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, struct kvmppc_spapr_tce_table, rcu); int i; + unsigned long npages = kvmppc_tce_pages(stt->window_size); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + for (i = 0; i < npages; i++) __free_page(stt->pages[i]); kfree(stt); @@ -62,7 +108,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->window_size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -89,6 +135,8 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); + kvmppc_account_memlimit( + kvmppc_stt_pages(kvmppc_tce_pages(stt->window_size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -103,7 +151,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + unsigned long npages; int ret = -ENOMEM; int i; @@ -113,7 +161,12 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_stt_npages(args->window_size); + npages = kvmppc_tce_pages(args->window_size); + ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); + if (ret) { + stt = NULL; + goto fail; + } stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- cgit From 462ee11e58c96b81707d98fb1d02a8a3e84290ce Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:07 +1100 Subject: KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K SPAPR_TCE_SHIFT is used in few places only and since IOMMU_PAGE_SHIFT_4K can be easily used instead, remove SPAPR_TCE_SHIFT. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 3 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 1a1e14f8572f..84993d151db3 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -36,12 +36,13 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) static unsigned long kvmppc_tce_pages(unsigned long window_size) { - return ALIGN((window_size >> SPAPR_TCE_SHIFT) + return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 124d69246e11..0ce4ffb2df12 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -99,7 +99,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> SPAPR_TCE_SHIFT; + idx = ioba >> IOMMU_PAGE_SHIFT_4K; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); @@ -127,7 +127,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> SPAPR_TCE_SHIFT; + idx = ioba >> IOMMU_PAGE_SHIFT_4K; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit From 5ee7af18642ce38c79b35927872f13d292cc3e27 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:08 +1100 Subject: KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers Upcoming multi-tce support (H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls) will validate TCE (not to have unexpected bits) and IO address (to be within the DMA window boundaries). This introduces helpers to validate TCE and IO address. The helpers are exported as they compile into vmlinux (to work in realmode) and will be used later by KVM kernel module in virtual mode. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 89 ++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0ce4ffb2df12..b608fdd0c6f6 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -36,6 +36,7 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -64,7 +65,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, * WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ -static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; @@ -76,6 +77,79 @@ static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long mask = + ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce & mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * would have to be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM @@ -85,9 +159,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, { struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); long ret; - unsigned long idx; - struct page *page; - u64 *tbl; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -99,13 +170,11 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> IOMMU_PAGE_SHIFT_4K; - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + return ret; - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; + kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce); return H_SUCCESS; } -- cgit From d3695aa4f452bc09c834a5010484f65fca37d87c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Feb 2016 12:55:09 +1100 Subject: KVM: PPC: Add support for multiple-TCE hcalls This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition between kernel and user space. The current implementation of kvmppc_h_stuff_tce() allows it to be executed in both real and virtual modes so there is one helper. The kvmppc_rm_h_put_tce_indirect() needs to translate the guest address to the host address and since the translation is different, there are 2 helpers - one for each mode. This implements the KVM_CAP_PPC_MULTITCE capability. When present, the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE if these are enabled by the userspace via KVM_CAP_PPC_ENABLE_HCALL. If they can not be handled by the kernel, they are passed on to the user space. The user space still has to have an implementation for these. Both HV and PR-syle KVM are supported. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 60 ++++++++++++- arch/powerpc/kvm/book3s_64_vio_hv.c | 150 +++++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_hv.c | 26 +++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- arch/powerpc/kvm/book3s_pr_papr.c | 35 ++++++++ arch/powerpc/kvm/powerpc.c | 3 + 6 files changed, 269 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 84993d151db3..94c8e7e9b58c 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. * Copyright 2011 David Gibson, IBM Corporation + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation */ #include @@ -37,8 +38,7 @@ #include #include #include - -#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +#include static unsigned long kvmppc_tce_pages(unsigned long window_size) { @@ -204,3 +204,59 @@ fail: } return ret; } + +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS, idx; + unsigned long entry, ua = 0; + u64 __user *tces, tce; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> IOMMU_PAGE_SHIFT_4K; + /* + * SPAPR spec says that the maximum size of the list is 512 TCEs + * so the whole table fits in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tces = (u64 __user *) ua; + + for (i = 0; i < npages; ++i) { + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index b608fdd0c6f6..0486aa2329ee 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. * Copyright 2011 David Gibson, IBM Corporation + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation */ #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -46,7 +49,7 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, unsigned long liobn) { struct kvm *kvm = vcpu->kvm; @@ -58,6 +61,7 @@ static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, return NULL; } +EXPORT_SYMBOL_GPL(kvmppc_find_table); /* * Validates IO address. @@ -151,9 +155,29 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, } EXPORT_SYMBOL_GPL(kvmppc_tce_put); -/* WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ +long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap) +{ + unsigned long gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (prmap) + *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { @@ -180,6 +204,122 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); +static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, + unsigned long ua, unsigned long *phpa) +{ + pte_t *ptep, pte; + unsigned shift = 0; + + ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + if (!ptep || !pte_present(*ptep)) + return -ENXIO; + pte = *ptep; + + if (!shift) + shift = PAGE_SHIFT; + + /* Avoid handling anything potentially complicated in realmode */ + if (shift > PAGE_SHIFT) + return -EAGAIN; + + if (!pte_young(pte)) + return -EAGAIN; + + *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | + (ua & ~PAGE_MASK); + + return 0; +} + +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS; + unsigned long tces, entry, ua = 0; + unsigned long *rmap = NULL; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> IOMMU_PAGE_SHIFT_4K; + /* + * The spec says that the maximum size of the list is 512 TCEs + * so the whole table addressed resides in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + unlock_rmap(rmap); + + return ret; +} + +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + /* Check permission bits only to allow userspace poison TCE for debug */ + if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) + return H_PARAMETER; + + for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) + kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce_value); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { @@ -205,3 +345,5 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); + +#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index baeddb06811d..33b491e6f666 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -768,7 +768,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (kvmppc_xics_enabled(vcpu)) { ret = kvmppc_xics_hcall(vcpu, req); break; - } /* fallthrough */ + } + return RESUME_HOST; + case H_PUT_TCE: + ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_PUT_TCE_INDIRECT: + ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_STUFF_TCE: + ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; default: return RESUME_HOST; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6ee26de9a1de..ed16182a008b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2006,8 +2006,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table - .long 0 /* 0x138 */ - .long 0 /* 0x13c */ + .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f2c75a1e0536..02176fd52f84 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba, + tce, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce_value = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return kvmppc_h_pr_bulk_remove(vcpu); case H_PUT_TCE: return kvmppc_h_pr_put_tce(vcpu); + case H_PUT_TCE_INDIRECT: + return kvmppc_h_pr_put_tce_indirect(vcpu); + case H_STUFF_TCE: + return kvmppc_h_pr_stuff_tce(vcpu); case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a3b182dcb823..69f897da782d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -569,6 +569,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; break; + case KVM_CAP_SPAPR_MULTITCE: + r = 1; + break; #endif default: r = 0; -- cgit From 79b6c247e9afe35714c1f83cfcecf40a438ca4a4 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:06 -0600 Subject: KVM: PPC: Book3S HV: Host-side RM data structures This patch defines the data structures to support the setting up of host side operations while running in real mode in the guest, and also the functions to allocate and free it. The operations are for now limited to virtual XICS operations. Currently, we have only defined one operation in the data structure: - Wake up a VCPU sleeping in the host when it receives a virtual interrupt The operations are assigned at the core level because PowerKVM requires that the host run in SMT off mode. For each core, we will need to manage its state atomically - where the state is defined by: 1. Is the core running in the host? 2. Is there a Real Mode (RM) operation pending on the host? Currently, core state is only managed at the whole-core level even when the system is in split-core mode. This just limits the number of free or "available" cores in the host to perform any host-side operations. The kvmppc_host_rm_core.rm_data allows any data to be passed by KVM in real mode to the host core along with the operation to be performed. The kvmppc_host_rm_ops structure is allocated the very first time a guest VM is started. Initial core state is also set - all online cores are in the host. This structure is never deleted, not even when there are no active guests. However, it needs to be freed when the module is unloaded because the kvmppc_host_rm_ops_hv can contain function pointers to kvm-hv.ko functions for the different supported host operations. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 70 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++ 2 files changed, 73 insertions(+) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 33b491e6f666..8b3332fb9ed2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3008,6 +3008,73 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +#ifdef CONFIG_KVM_XICS +/* + * Allocate a per-core structure for managing state about which cores are + * running in the host versus the guest and for exchanging data between + * real mode KVM and CPU running in the host. + * This is only done for the first VM. + * The allocated structure stays even if all VMs have stopped. + * It is only freed when the kvm-hv module is unloaded. + * It's OK for this routine to fail, we just don't support host + * core operations like redirecting H_IPI wakeups. + */ +void kvmppc_alloc_host_rm_ops(void) +{ + struct kvmppc_host_rm_ops *ops; + unsigned long l_ops; + int cpu, core; + int size; + + /* Not the first time here ? */ + if (kvmppc_host_rm_ops_hv != NULL) + return; + + ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); + if (!ops) + return; + + size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); + ops->rm_core = kzalloc(size, GFP_KERNEL); + + if (!ops->rm_core) { + kfree(ops); + return; + } + + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { + if (!cpu_online(cpu)) + continue; + + core = cpu >> threads_shift; + ops->rm_core[core].rm_state.in_host = 1; + } + + /* + * Make the contents of the kvmppc_host_rm_ops structure visible + * to other CPUs before we assign it to the global variable. + * Do an atomic assignment (no locks used here), but if someone + * beats us to it, just free our copy and return. + */ + smp_wmb(); + l_ops = (unsigned long) ops; + + if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + kfree(ops->rm_core); + kfree(ops); + } +} + +void kvmppc_free_host_rm_ops(void) +{ + if (kvmppc_host_rm_ops_hv) { + kfree(kvmppc_host_rm_ops_hv->rm_core); + kfree(kvmppc_host_rm_ops_hv); + kvmppc_host_rm_ops_hv = NULL; + } +} +#endif + static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -3020,6 +3087,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + kvmppc_alloc_host_rm_ops(); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3253,6 +3322,7 @@ static int kvmppc_book3s_init_hv(void) static void kvmppc_book3s_exit_hv(void) { + kvmppc_free_host_rm_ops(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fd7006bf6b1a..5f0380db3eab 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap) kvmhv_interrupt_vcore(vc, ee); } } + +struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; +EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); -- cgit From b8e6a87c82927ed9ccf0f3ee42946a41cb9d75fe Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:07 -0600 Subject: KVM: PPC: Book3S HV: Manage core host state Update the core host state in kvmppc_host_rm_ops whenever the primary thread of the core enters the guest or returns back. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8b3332fb9ed2..542ec97129b4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2302,6 +2302,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) spin_unlock(&vc->lock); } +/* + * Clear core from the list of active host cores as we are about to + * enter the guest. Only do this if it is the primary thread of the + * core (not if a subcore) that is entering the guest. + */ +static inline void kvmppc_clear_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + /* + * Memory barrier can be omitted here as we will do a smp_wmb() + * later in kvmppc_start_thread and we need ensure that state is + * visible to other CPUs only after we enter guest. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; +} + +/* + * Advertise this core as an active host core since we exited the guest + * Only need to do this if it is the primary thread of the core that is + * exiting. + */ +static inline void kvmppc_set_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + + /* + * Memory barrier can be omitted here because we do a spin_unlock + * immediately after this which provides the memory barrier. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. @@ -2414,6 +2454,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } + kvmppc_clear_host_core(pcpu); + /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { @@ -2510,6 +2552,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_ipi_thread(pcpu + i); } + kvmppc_set_host_core(pcpu); + spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ -- cgit From 6f3bb80944148012cbac1f98da249f591cbcae43 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:08 -0600 Subject: KVM: PPC: Book3S HV: kvmppc_host_rm_ops - handle offlining CPUs The kvmppc_host_rm_ops structure keeps track of which cores are are in the host by maintaining a bitmask of active/runnable online CPUs that have not entered the guest. This patch adds support to manage the bitmask when a CPU is offlined or onlined in the host. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 542ec97129b4..16304d2c0cb7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3053,6 +3053,36 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_XICS +static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + kvmppc_set_host_core(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kvmppc_clear_host_core(cpu); + break; +#endif + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block kvmppc_cpu_notifier = { + .notifier_call = kvmppc_cpu_notify, +}; + /* * Allocate a per-core structure for managing state about which cores are * running in the host versus the guest and for exchanging data between @@ -3086,6 +3116,8 @@ void kvmppc_alloc_host_rm_ops(void) return; } + get_online_cpus(); + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) continue; @@ -3104,14 +3136,21 @@ void kvmppc_alloc_host_rm_ops(void) l_ops = (unsigned long) ops; if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + put_online_cpus(); kfree(ops->rm_core); kfree(ops); + return; } + + register_cpu_notifier(&kvmppc_cpu_notifier); + + put_online_cpus(); } void kvmppc_free_host_rm_ops(void) { if (kvmppc_host_rm_ops_hv) { + unregister_cpu_notifier(&kvmppc_cpu_notifier); kfree(kvmppc_host_rm_ops_hv->rm_core); kfree(kvmppc_host_rm_ops_hv); kvmppc_host_rm_ops_hv = NULL; -- cgit From 0c2a66062470cd1f6d11ae6db31059f59d3f725f Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Thu, 17 Dec 2015 14:59:09 -0600 Subject: KVM: PPC: Book3S HV: Host side kick VCPU when poked by real-mode KVM This patch adds the support for the kick VCPU operation for kvmppc_host_rm_ops. The kvmppc_xics_ipi_action() function provides the function to be invoked for a host side operation when poked by the real mode KVM. This is initiated by KVM by sending an IPI to any free host core. KVM real mode must set the rm_action to XICS_RM_KICK_VCPU and rm_data to point to the VCPU to be woken up before sending the IPI. Note that we have allocated one kvmppc_host_rm_core structure per core. The above values need to be set in the structure corresponding to the core to which the IPI will be sent. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 16304d2c0cb7..c3c731085c1f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3126,6 +3126,8 @@ void kvmppc_alloc_host_rm_ops(void) ops->rm_core[core].rm_state.in_host = 1; } + ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; + /* * Make the contents of the kvmppc_host_rm_ops structure visible * to other CPUs before we assign it to the global variable. diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 24f58076d49e..43ffbfe2a18a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "book3s_xics.h" @@ -623,3 +624,38 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) bail: return check_too_hard(xics, icp); } + +/* --- Non-real mode XICS-related built-in routines --- */ + +/** + * Host Operations poked by RM KVM + */ +static void rm_host_ipi_action(int action, void *data) +{ + switch (action) { + case XICS_RM_KICK_VCPU: + kvmppc_host_rm_ops_hv->vcpu_kick(data); + break; + default: + WARN(1, "Unexpected rm_action=%d data=%p\n", action, data); + break; + } + +} + +void kvmppc_xics_ipi_action(void) +{ + int core; + unsigned int cpu = smp_processor_id(); + struct kvmppc_host_rm_core *rm_corep; + + core = cpu >> threads_shift; + rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core]; + + if (rm_corep->rm_data) { + rm_host_ipi_action(rm_corep->rm_state.rm_action, + rm_corep->rm_data); + rm_corep->rm_data = NULL; + rm_corep->rm_state.rm_action = 0; + } +} -- cgit From e17769eb8c897101e2c6df62ec397e450b6e53b4 Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Mon, 21 Dec 2015 16:22:51 -0600 Subject: KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU This patch adds support to real-mode KVM to search for a core running in the host partition and send it an IPI message with VCPU to be woken. This avoids having to switch to the host partition to complete an H_IPI hypercall when the VCPU which is the target of the the H_IPI is not loaded (is not running in the guest). The patch also includes the support in the IPI handler running in the host to do the wakeup by calling kvmppc_xics_ipi_action for the PPC_MSG_RM_HOST_ACTION message. When a guest is being destroyed, we need to ensure that there are no pending IPIs waiting to wake up a VCPU before we free the VCPUs of the guest. This is accomplished by: - Forces a PPC_MSG_CALL_FUNCTION IPI to be completed by all CPUs before freeing any VCPUs in kvm_arch_destroy_vm(). - Any PPC_MSG_RM_HOST_ACTION messages must be executed first before any other PPC_MSG_CALL_FUNCTION messages. Signed-off-by: Suresh Warrier Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 92 ++++++++++++++++++++++++++++++++++-- arch/powerpc/kvm/powerpc.c | 10 ++++ 2 files changed, 99 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 43ffbfe2a18a..e673fb9fee98 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -51,11 +51,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, /* -- ICP routines -- */ +#ifdef CONFIG_SMP +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) +{ + int hcpu; + + hcpu = hcore << threads_shift; + kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; + smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); + icp_native_cause_ipi_rm(hcpu); +} +#else +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } +#endif + +/* + * We start the search from our current CPU Id in the core map + * and go in a circle until we get back to our ID looking for a + * core that is running in host context and that hasn't already + * been targeted for another rm_host_ops. + * + * In the future, could consider using a fairer algorithm (one + * that distributes the IPIs better) + * + * Returns -1, if no CPU could be found in the host + * Else, returns a CPU Id which has been reserved for use + */ +static inline int grab_next_hostcore(int start, + struct kvmppc_host_rm_core *rm_core, int max, int action) +{ + bool success; + int core; + union kvmppc_rm_state old, new; + + for (core = start + 1; core < max; core++) { + old = new = READ_ONCE(rm_core[core].rm_state); + + if (!old.in_host || old.rm_action) + continue; + + /* Try to grab this host core if not taken already. */ + new.rm_action = action; + + success = cmpxchg64(&rm_core[core].rm_state.raw, + old.raw, new.raw) == old.raw; + if (success) { + /* + * Make sure that the store to the rm_action is made + * visible before we return to caller (and the + * subsequent store to rm_data) to synchronize with + * the IPI handler. + */ + smp_wmb(); + return core; + } + } + + return -1; +} + +static inline int find_available_hostcore(int action) +{ + int core; + int my_core = smp_processor_id() >> threads_shift; + struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core; + + core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action); + if (core == -1) + core = grab_next_hostcore(core, rm_core, my_core, action); + + return core; +} + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; int cpu; + int hcore; /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; @@ -67,11 +140,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } - /* Check if the core is loaded, if not, too hard */ + /* + * Check if the core is loaded, + * if not, find an available host core to post to wake the VCPU, + * if we can't find one, set up state to eventually return too hard. + */ cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { - this_icp->rm_action |= XICS_RM_KICK_VCPU; - this_icp->rm_kick_target = vcpu; + hcore = -1; + if (kvmppc_host_rm_ops_hv) + hcore = find_available_hostcore(XICS_RM_KICK_VCPU); + if (hcore != -1) { + icp_send_hcore_msg(hcore, vcpu); + } else { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + } return; } @@ -655,7 +739,9 @@ void kvmppc_xics_ipi_action(void) if (rm_corep->rm_data) { rm_host_ipi_action(rm_corep->rm_state.rm_action, rm_corep->rm_data); + /* Order these stores against the real mode KVM */ rm_corep->rm_data = NULL; + smp_wmb(); rm_corep->rm_state.rm_action = 0; } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 69f897da782d..9258675e2ff7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -437,6 +437,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; +#ifdef CONFIG_KVM_XICS + /* + * We call kick_all_cpus_sync() to ensure that all + * CPUs have executed any pending IPIs before we + * continue and free VCPUs structures below. + */ + if (is_kvmppc_hv_enabled(kvm)) + kick_all_cpus_sync(); +#endif + kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); -- cgit From 520fe9c607d3acea96391aad27e17518bd7d39bd Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Mon, 21 Dec 2015 16:33:57 -0600 Subject: KVM: PPC: Book3S HV: Add tunable to control H_IPI redirection Redirecting the wakeup of a VCPU from the H_IPI hypercall to a core running in the host is usually a good idea, most workloads seemed to benefit. However, in one heavily interrupt-driven SMT1 workload, some regression was observed. This patch adds a kvm_hv module parameter called h_ipi_redirect to control this feature. The default value for this tunable is 1 - that is enable the feature. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 11 +++++++++++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 5 ++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c3c731085c1f..f47fffefadc1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -81,6 +81,17 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +#ifdef CONFIG_KVM_XICS +static struct kernel_param_ops module_param_ops = { + .set = param_set_int, + .get = param_get_int, +}; + +module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); +#endif + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index e673fb9fee98..980d8a6f7284 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -24,6 +24,9 @@ #define DEBUG_PASSUP +int h_ipi_redirect = 1; +EXPORT_SYMBOL(h_ipi_redirect); + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); @@ -148,7 +151,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { hcore = -1; - if (kvmppc_host_rm_ops_hv) + if (kvmppc_host_rm_ops_hv && h_ipi_redirect) hcore = find_available_hostcore(XICS_RM_KICK_VCPU); if (hcore != -1) { icp_send_hcore_msg(hcore, vcpu); -- cgit From fe26e52712ccab6648df17ecc029a68a69a01a85 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:38 +1100 Subject: KVM: PPC: Add @page_shift to kvmppc_spapr_tce_table At the moment the kvmppc_spapr_tce_table struct can only describe 4GB windows and handle fixed size (4K) pages. Dynamic DMA windows support more so these limits need to be extended. This replaces window_size (in bytes, 4GB max) with page_shift (32bit) and size (64bit, in pages). This should cause no behavioural change as this is changing the internal structures only - the user interface still only allows one to create a 32-bit table with 4KiB pages at this stage. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 22 +++++++++++----------- arch/powerpc/kvm/book3s_64_vio_hv.c | 21 ++++++++++----------- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 94c8e7e9b58c..61cbc449d0a8 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -40,10 +40,9 @@ #include #include -static unsigned long kvmppc_tce_pages(unsigned long window_size) +static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { - return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } static unsigned long kvmppc_stt_pages(unsigned long tce_pages) @@ -95,8 +94,7 @@ static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, struct kvmppc_spapr_tce_table, rcu); - int i; - unsigned long npages = kvmppc_tce_pages(stt->window_size); + unsigned long i, npages = kvmppc_tce_pages(stt->size); for (i = 0; i < npages; i++) __free_page(stt->pages[i]); @@ -109,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_tce_pages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -137,7 +135,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( - kvmppc_stt_pages(kvmppc_tce_pages(stt->window_size)), false); + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -152,7 +150,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - unsigned long npages; + unsigned long npages, size; int ret = -ENOMEM; int i; @@ -162,7 +160,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_tce_pages(args->window_size); + size = args->window_size >> IOMMU_PAGE_SHIFT_4K; + npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { stt = NULL; @@ -175,7 +174,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->window_size = args->window_size; + stt->page_shift = IOMMU_PAGE_SHIFT_4K; + stt->size = size; stt->kvm = kvm; for (i = 0; i < npages; i++) { @@ -218,7 +218,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - entry = ioba >> IOMMU_PAGE_SHIFT_4K; + entry = ioba >> stt->page_shift; /* * SPAPR spec says that the maximum size of the list is 512 TCEs * so the whole table fits in 4K page diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 0486aa2329ee..c786a58c28a7 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -72,11 +72,10 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { - unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1; - unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; - unsigned long size = stt->window_size >> IOMMU_PAGE_SHIFT_4K; + unsigned long mask = (1ULL << stt->page_shift) - 1; + unsigned long idx = ioba >> stt->page_shift; - if ((ioba & mask) || (idx + npages > size) || (idx + npages < idx)) + if ((ioba & mask) || (idx + npages > stt->size) || (idx + npages < idx)) return H_PARAMETER; return H_SUCCESS; @@ -96,8 +95,8 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long mask = - ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ); + unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); + unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); if (tce & mask) return H_PARAMETER; @@ -198,7 +197,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce); + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); return H_SUCCESS; } @@ -244,7 +243,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - entry = ioba >> IOMMU_PAGE_SHIFT_4K; + entry = ioba >> stt->page_shift; /* * The spec says that the maximum size of the list is 512 TCEs * so the whole table addressed resides in 4K page @@ -313,8 +312,8 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; - for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba >> IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); return H_SUCCESS; } @@ -336,7 +335,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> IOMMU_PAGE_SHIFT_4K; + idx = ioba >> stt->page_shift; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit From 14f853f1b257b69cf0213ad8c49c01038ccf7ef9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:39 +1100 Subject: KVM: PPC: Add @offset to kvmppc_spapr_tce_table This enables userspace view of TCE tables to start from non-zero offset on a bus. This will be used for huge DMA windows. This only changes the internal structure, the user interface needs to change in order to use an offset. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index c786a58c28a7..44be73e6aa26 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -75,7 +75,9 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long mask = (1ULL << stt->page_shift) - 1; unsigned long idx = ioba >> stt->page_shift; - if ((ioba & mask) || (idx + npages > stt->size) || (idx + npages < idx)) + if ((ioba & mask) || (idx < stt->offset) || + (idx - stt->offset + npages > stt->size) || + (idx + npages < idx)) return H_PARAMETER; return H_SUCCESS; @@ -147,6 +149,7 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, struct page *page; u64 *tbl; + idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; tbl = kvmppc_page_address(page); @@ -335,7 +338,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - idx = ioba >> stt->page_shift; + idx = (ioba >> stt->page_shift) - stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- cgit From 58ded4201ff028b15f6b317228faa5f154a0663f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 1 Mar 2016 17:54:40 +1100 Subject: KVM: PPC: Add support for 64bit TCE windows The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not enough for directly mapped windows as the guest can get more than 4GB. This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it via KVM_CAP_SPAPR_TCE_64 capability. The table size is checked against the locked memory limit. Since 64bit windows are to support Dynamic DMA windows (DDW), let's add @bus_offset and @page_shift which are also required by DDW. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 10 +++++++--- arch/powerpc/kvm/powerpc.c | 25 ++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kvm') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 61cbc449d0a8..2c2d1030843a 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -147,20 +147,23 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; unsigned long npages, size; int ret = -ENOMEM; int i; + if (!args->size) + return -EINVAL; + /* Check this LIOBN hasn't been previously allocated */ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { if (stt->liobn == args->liobn) return -EBUSY; } - size = args->window_size >> IOMMU_PAGE_SHIFT_4K; + size = args->size; npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { @@ -174,7 +177,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->page_shift = IOMMU_PAGE_SHIFT_4K; + stt->page_shift = args->page_shift; + stt->offset = args->offset; stt->size = size; stt->kvm = kvm; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9258675e2ff7..19aa59b0850c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -519,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_SPAPR_TCE_64: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: @@ -1344,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } #ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CREATE_SPAPR_TCE_64: { + struct kvm_create_spapr_tce_64 create_tce_64; + + r = -EFAULT; + if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64))) + goto out; + if (create_tce_64.flags) { + r = -EINVAL; + goto out; + } + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); + goto out; + } case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; + struct kvm_create_spapr_tce_64 create_tce_64; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) goto out; - r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + + create_tce_64.liobn = create_tce.liobn; + create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K; + create_tce_64.offset = 0; + create_tce_64.size = create_tce.window_size >> + IOMMU_PAGE_SHIFT_4K; + create_tce_64.flags = 0; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); goto out; } case KVM_PPC_GET_SMMU_INFO: { -- cgit