summaryrefslogtreecommitdiff
path: root/arch/powerpc/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r--arch/powerpc/kvm/Makefile3
-rw-r--r--arch/powerpc/kvm/book3s.c1
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c743
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c716
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c5
-rw-r--r--arch/powerpc/kvm/book3s_hv.c268
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c41
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c87
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c156
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S154
-rw-r--r--arch/powerpc/kvm/book3s_pr.c130
-rw-r--r--arch/powerpc/kvm/book3s_segment.S32
-rw-r--r--arch/powerpc/kvm/book3s_xics.c192
-rw-r--r--arch/powerpc/kvm/book3s_xics.h7
-rw-r--r--arch/powerpc/kvm/powerpc.c42
17 files changed, 2085 insertions, 498 deletions
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 7dd89b79d038..b87ccde2137a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -70,7 +70,8 @@ endif
kvm-hv-y += \
book3s_hv.o \
book3s_hv_interrupts.o \
- book3s_64_mmu_hv.o
+ book3s_64_mmu_hv.o \
+ book3s_64_mmu_radix.o
kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
book3s_hv_rm_xics.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 019f008775b9..b6b5c185bd92 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
kvmppc_set_dsisr(vcpu, flags);
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage); /* used by kvm_hv */
void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
{
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index a2eb6d354a57..1992676c7a94 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -224,7 +224,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
- printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp);
+ printk_ratelimited(KERN_ERR
+ "KVM: Can't copy data from 0x%lx!\n", ptegp);
goto no_page_found;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b9131aa1aedf..70153578131a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -265,7 +265,8 @@ do_second:
goto no_page_found;
if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
- printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
+ printk_ratelimited(KERN_ERR
+ "KVM: Can't copy data from 0x%lx!\n", ptegp);
goto no_page_found;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index b795dd1ac2ef..f3158fb16de3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -40,84 +40,104 @@
#include "trace_hv.h"
-/* Power architecture requires HPT is at least 256kB */
-#define PPC_MIN_HPT_ORDER 18
+//#define DEBUG_RESIZE_HPT 1
+
+#ifdef DEBUG_RESIZE_HPT
+#define resize_hpt_debug(resize, ...) \
+ do { \
+ printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \
+ printk(__VA_ARGS__); \
+ } while (0)
+#else
+#define resize_hpt_debug(resize, ...) \
+ do { } while (0)
+#endif
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh,
unsigned long ptel, unsigned long *pte_idx_ret);
+
+struct kvm_resize_hpt {
+ /* These fields read-only after init */
+ struct kvm *kvm;
+ struct work_struct work;
+ u32 order;
+
+ /* These fields protected by kvm->lock */
+ int error;
+ bool prepare_done;
+
+ /* Private to the work thread, until prepare_done is true,
+ * then protected by kvm->resize_hpt_sem */
+ struct kvm_hpt_info hpt;
+};
+
static void kvmppc_rmap_reset(struct kvm *kvm);
-long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
+int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
- struct revmap_entry *rev;
+ int cma = 0;
struct page *page = NULL;
- long order = KVM_DEFAULT_HPT_ORDER;
+ struct revmap_entry *rev;
+ unsigned long npte;
- if (htab_orderp) {
- order = *htab_orderp;
- if (order < PPC_MIN_HPT_ORDER)
- order = PPC_MIN_HPT_ORDER;
- }
+ if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
+ return -EINVAL;
- kvm->arch.hpt_cma_alloc = 0;
- page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
+ page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
if (page) {
hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
memset((void *)hpt, 0, (1ul << order));
- kvm->arch.hpt_cma_alloc = 1;
+ cma = 1;
}
- /* Lastly try successively smaller sizes from the page allocator */
- /* Only do this if userspace didn't specify a size via ioctl */
- while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
- hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
- __GFP_NOWARN, order - PAGE_SHIFT);
- if (!hpt)
- --order;
- }
+ if (!hpt)
+ hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT
+ |__GFP_NOWARN, order - PAGE_SHIFT);
if (!hpt)
return -ENOMEM;
- kvm->arch.hpt_virt = hpt;
- kvm->arch.hpt_order = order;
/* HPTEs are 2**4 bytes long */
- kvm->arch.hpt_npte = 1ul << (order - 4);
- /* 128 (2**7) bytes in each HPTEG */
- kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
-
- atomic64_set(&kvm->arch.mmio_update, 0);
+ npte = 1ul << (order - 4);
/* Allocate reverse map array */
- rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
+ rev = vmalloc(sizeof(struct revmap_entry) * npte);
if (!rev) {
- pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
- goto out_freehpt;
+ pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
+ if (cma)
+ kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
+ else
+ free_pages(hpt, order - PAGE_SHIFT);
+ return -ENOMEM;
}
- kvm->arch.revmap = rev;
- kvm->arch.sdr1 = __pa(hpt) | (order - 18);
- pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
- hpt, order, kvm->arch.lpid);
+ info->order = order;
+ info->virt = hpt;
+ info->cma = cma;
+ info->rev = rev;
- if (htab_orderp)
- *htab_orderp = order;
return 0;
+}
- out_freehpt:
- if (kvm->arch.hpt_cma_alloc)
- kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
- else
- free_pages(hpt, order - PAGE_SHIFT);
- return -ENOMEM;
+void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
+{
+ atomic64_set(&kvm->arch.mmio_update, 0);
+ kvm->arch.hpt = *info;
+ kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
+
+ pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
+ info->virt, (long)info->order, kvm->arch.lpid);
}
-long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
long err = -EBUSY;
- long order;
+ struct kvm_hpt_info info;
+
+ if (kvm_is_radix(kvm))
+ return -EINVAL;
mutex_lock(&kvm->lock);
if (kvm->arch.hpte_setup_done) {
@@ -129,37 +149,44 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
goto out;
}
}
- if (kvm->arch.hpt_virt) {
- order = kvm->arch.hpt_order;
+ if (kvm->arch.hpt.order == order) {
+ /* We already have a suitable HPT */
+
/* Set the entire HPT to 0, i.e. invalid HPTEs */
- memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+ memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
/*
* Reset all the reverse-mapping chains for all memslots
*/
kvmppc_rmap_reset(kvm);
/* Ensure that each vcpu will flush its TLB on next entry. */
cpumask_setall(&kvm->arch.need_tlb_flush);
- *htab_orderp = order;
err = 0;
- } else {
- err = kvmppc_alloc_hpt(kvm, htab_orderp);
- order = *htab_orderp;
+ goto out;
}
- out:
+
+ if (kvm->arch.hpt.virt)
+ kvmppc_free_hpt(&kvm->arch.hpt);
+
+ err = kvmppc_allocate_hpt(&info, order);
+ if (err < 0)
+ goto out;
+ kvmppc_set_hpt(kvm, &info);
+
+out:
mutex_unlock(&kvm->lock);
return err;
}
-void kvmppc_free_hpt(struct kvm *kvm)
+void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
- kvmppc_free_lpid(kvm->arch.lpid);
- vfree(kvm->arch.revmap);
- if (kvm->arch.hpt_cma_alloc)
- kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
- 1 << (kvm->arch.hpt_order - PAGE_SHIFT));
- else
- free_pages(kvm->arch.hpt_virt,
- kvm->arch.hpt_order - PAGE_SHIFT);
+ vfree(info->rev);
+ if (info->cma)
+ kvm_free_hpt_cma(virt_to_page(info->virt),
+ 1 << (info->order - PAGE_SHIFT));
+ else if (info->virt)
+ free_pages(info->virt, info->order - PAGE_SHIFT);
+ info->virt = 0;
+ info->order = 0;
}
/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -194,8 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
if (npages > 1ul << (40 - porder))
npages = 1ul << (40 - porder);
/* Can't use more than 1 HPTE per HPTEG */
- if (npages > kvm->arch.hpt_mask + 1)
- npages = kvm->arch.hpt_mask + 1;
+ if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
+ npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -205,7 +232,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
for (i = 0; i < npages; ++i) {
addr = i << porder;
/* can't use hpt_hash since va > 64 bits */
- hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
+ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
+ & kvmppc_hpt_mask(&kvm->arch.hpt);
/*
* We assume that the hash table is empty and no
* vcpus are using it at this stage. Since we create
@@ -338,11 +366,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
preempt_enable();
return -ENOENT;
}
- hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+ hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
if (cpu_has_feature(CPU_FTR_ARCH_300))
v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
- gr = kvm->arch.revmap[index].guest_rpte;
+ gr = kvm->arch.hpt.rev[index].guest_rpte;
unlock_hpte(hptep, orig_v);
preempt_enable();
@@ -392,8 +420,8 @@ static int instruction_is_store(unsigned int instr)
return (instr & mask) != 0;
}
-static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
- unsigned long gpa, gva_t ea, int is_store)
+int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned long gpa, gva_t ea, int is_store)
{
u32 last_inst;
@@ -458,6 +486,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned long rcbits;
long mmio_update;
+ if (kvm_is_radix(kvm))
+ return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
+
/*
* Real-mode code has already searched the HPT and found the
* entry we're interested in. Lock the entry and check that
@@ -480,8 +511,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
}
index = vcpu->arch.pgfault_index;
- hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
- rev = &kvm->arch.revmap[index];
+ hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+ rev = &kvm->arch.hpt.rev[index];
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
@@ -695,12 +726,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
+typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
- int (*handler)(struct kvm *kvm,
- unsigned long *rmapp,
- unsigned long gfn))
+ hva_handler_fn handler)
{
int ret;
int retval = 0;
@@ -725,9 +757,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
for (; gfn < gfn_end; ++gfn) {
- gfn_t gfn_offset = gfn - memslot->base_gfn;
-
- ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
+ ret = handler(kvm, memslot, gfn);
retval |= ret;
}
}
@@ -736,20 +766,61 @@ static int kvm_handle_hva_range(struct kvm *kvm,
}
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long gfn))
+ hva_handler_fn handler)
{
return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+/* Must be called with both HPTE and rmap locked */
+static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+ unsigned long *rmapp, unsigned long gfn)
+{
+ __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
+ unsigned long j, h;
+ unsigned long ptel, psize, rcbits;
+
+ j = rev[i].forw;
+ if (j == i) {
+ /* chain is now empty */
+ *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+ } else {
+ /* remove i from chain */
+ h = rev[i].back;
+ rev[h].forw = j;
+ rev[j].back = h;
+ rev[i].forw = rev[i].back = i;
+ *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+ }
+
+ /* Now check and modify the HPTE */
+ ptel = rev[i].guest_rpte;
+ psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+ if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+ hpte_rpn(ptel, psize) == gfn) {
+ hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+ kvmppc_invalidate_hpte(kvm, hptep, i);
+ hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+ /* Harvest R and C */
+ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+ *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+ if (rcbits & HPTE_R_C)
+ kvmppc_update_rmap_change(rmapp, psize);
+ if (rcbits & ~rev[i].guest_rpte) {
+ rev[i].guest_rpte = ptel | rcbits;
+ note_hpte_modification(kvm, &rev[i]);
+ }
+ }
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
- unsigned long h, i, j;
+ unsigned long i;
__be64 *hptep;
- unsigned long ptel, psize, rcbits;
+ unsigned long *rmapp;
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
for (;;) {
lock_rmap(rmapp);
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
@@ -763,7 +834,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
* rmap chain lock.
*/
i = *rmapp & KVMPPC_RMAP_INDEX;
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
/* unlock rmap before spinning on the HPTE lock */
unlock_rmap(rmapp);
@@ -771,37 +842,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
cpu_relax();
continue;
}
- j = rev[i].forw;
- if (j == i) {
- /* chain is now empty */
- *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
- } else {
- /* remove i from chain */
- h = rev[i].back;
- rev[h].forw = j;
- rev[j].back = h;
- rev[i].forw = rev[i].back = i;
- *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
- }
- /* Now check and modify the HPTE */
- ptel = rev[i].guest_rpte;
- psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
- if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
- hpte_rpn(ptel, psize) == gfn) {
- hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
- kvmppc_invalidate_hpte(kvm, hptep, i);
- hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
- /* Harvest R and C */
- rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
- *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
- if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmapp, psize);
- if (rcbits & ~rev[i].guest_rpte) {
- rev[i].guest_rpte = ptel | rcbits;
- note_hpte_modification(kvm, &rev[i]);
- }
- }
+ kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
@@ -810,26 +852,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
{
- kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ hva_handler_fn handler;
+
+ handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+ kvm_handle_hva(kvm, hva, handler);
return 0;
}
int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
- kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+ hva_handler_fn handler;
+
+ handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+ kvm_handle_hva_range(kvm, start, end, handler);
return 0;
}
void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- unsigned long *rmapp;
unsigned long gfn;
unsigned long n;
+ unsigned long *rmapp;
- rmapp = memslot->arch.rmap;
gfn = memslot->base_gfn;
- for (n = memslot->npages; n; --n) {
+ rmapp = memslot->arch.rmap;
+ for (n = memslot->npages; n; --n, ++gfn) {
+ if (kvm_is_radix(kvm)) {
+ kvm_unmap_radix(kvm, memslot, gfn);
+ continue;
+ }
/*
* Testing the present bit without locking is OK because
* the memslot has been marked invalid already, and hence
@@ -837,20 +889,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
* thus the present bit can't go from 0 to 1.
*/
if (*rmapp & KVMPPC_RMAP_PRESENT)
- kvm_unmap_rmapp(kvm, rmapp, gfn);
+ kvm_unmap_rmapp(kvm, memslot, gfn);
++rmapp;
- ++gfn;
}
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
__be64 *hptep;
int ret = 0;
+ unsigned long *rmapp;
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
retry:
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_REFERENCED) {
@@ -864,7 +917,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */
@@ -898,17 +951,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
- return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
+ hva_handler_fn handler;
+
+ handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
+ return kvm_handle_hva_range(kvm, start, end, handler);
}
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long *hp;
int ret = 1;
+ unsigned long *rmapp;
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
if (*rmapp & KVMPPC_RMAP_REFERENCED)
return 1;
@@ -919,7 +977,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (*rmapp & KVMPPC_RMAP_PRESENT) {
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
+ hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
if (be64_to_cpu(hp[1]) & HPTE_R_R)
goto out;
@@ -934,12 +992,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+ hva_handler_fn handler;
+
+ handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
+ return kvm_handle_hva(kvm, hva, handler);
}
void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
{
- kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ hva_handler_fn handler;
+
+ handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+ kvm_handle_hva(kvm, hva, handler);
}
static int vcpus_running(struct kvm *kvm)
@@ -953,7 +1017,7 @@ static int vcpus_running(struct kvm *kvm)
*/
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long n;
unsigned long v, r;
@@ -978,7 +1042,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
unsigned long hptep1;
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
/*
@@ -1040,7 +1104,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
return npages_dirty;
}
-static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
struct kvm_memory_slot *memslot,
unsigned long *map)
{
@@ -1058,12 +1122,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
__set_bit_le(gfn - memslot->base_gfn, map);
}
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long *map)
+long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
+ struct kvm_memory_slot *memslot, unsigned long *map)
{
unsigned long i, j;
unsigned long *rmapp;
- struct kvm_vcpu *vcpu;
preempt_disable();
rmapp = memslot->arch.rmap;
@@ -1079,15 +1142,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
__set_bit_le(j, map);
++rmapp;
}
-
- /* Harvest dirty bits from VPA and DTL updates */
- /* Note: we never modify the SLB shadow buffer areas */
- kvm_for_each_vcpu(i, vcpu, kvm) {
- spin_lock(&vcpu->arch.vpa_update_lock);
- harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
- harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
- spin_unlock(&vcpu->arch.vpa_update_lock);
- }
preempt_enable();
return 0;
}
@@ -1142,15 +1196,376 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, gfn);
if (memslot) {
- rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
- lock_rmap(rmap);
- *rmap |= KVMPPC_RMAP_CHANGED;
- unlock_rmap(rmap);
+ if (!kvm_is_radix(kvm)) {
+ rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ lock_rmap(rmap);
+ *rmap |= KVMPPC_RMAP_CHANGED;
+ unlock_rmap(rmap);
+ } else if (memslot->dirty_bitmap) {
+ mark_page_dirty(kvm, gfn);
+ }
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
/*
+ * HPT resizing
+ */
+static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
+{
+ int rc;
+
+ rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
+ if (rc < 0)
+ return rc;
+
+ resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
+ resize->hpt.virt);
+
+ return 0;
+}
+
+static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
+ unsigned long idx)
+{
+ struct kvm *kvm = resize->kvm;
+ struct kvm_hpt_info *old = &kvm->arch.hpt;
+ struct kvm_hpt_info *new = &resize->hpt;
+ unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
+ unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
+ __be64 *hptep, *new_hptep;
+ unsigned long vpte, rpte, guest_rpte;
+ int ret;
+ struct revmap_entry *rev;
+ unsigned long apsize, psize, avpn, pteg, hash;
+ unsigned long new_idx, new_pteg, replace_vpte;
+
+ hptep = (__be64 *)(old->virt + (idx << 4));
+
+ /* Guest is stopped, so new HPTEs can't be added or faulted
+ * in, only unmapped or altered by host actions. So, it's
+ * safe to check this before we take the HPTE lock */
+ vpte = be64_to_cpu(hptep[0]);
+ if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+ return 0; /* nothing to do */
+
+ while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+ cpu_relax();
+
+ vpte = be64_to_cpu(hptep[0]);
+
+ ret = 0;
+ if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+ /* Nothing to do */
+ goto out;
+
+ /* Unmap */
+ rev = &old->rev[idx];
+ guest_rpte = rev->guest_rpte;
+
+ ret = -EIO;
+ apsize = hpte_page_size(vpte, guest_rpte);
+ if (!apsize)
+ goto out;
+
+ if (vpte & HPTE_V_VALID) {
+ unsigned long gfn = hpte_rpn(guest_rpte, apsize);
+ int srcu_idx = srcu_read_lock(&kvm->srcu);
+ struct kvm_memory_slot *memslot =
+ __gfn_to_memslot(kvm_memslots(kvm), gfn);
+
+ if (memslot) {
+ unsigned long *rmapp;
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+
+ lock_rmap(rmapp);
+ kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+ unlock_rmap(rmapp);
+ }
+
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ }
+
+ /* Reload PTE after unmap */
+ vpte = be64_to_cpu(hptep[0]);
+
+ BUG_ON(vpte & HPTE_V_VALID);
+ BUG_ON(!(vpte & HPTE_V_ABSENT));
+
+ ret = 0;
+ if (!(vpte & HPTE_V_BOLTED))
+ goto out;
+
+ rpte = be64_to_cpu(hptep[1]);
+ psize = hpte_base_page_size(vpte, rpte);
+ avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
+ pteg = idx / HPTES_PER_GROUP;
+ if (vpte & HPTE_V_SECONDARY)
+ pteg = ~pteg;
+
+ if (!(vpte & HPTE_V_1TB_SEG)) {
+ unsigned long offset, vsid;
+
+ /* We only have 28 - 23 bits of offset in avpn */
+ offset = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (psize < (1ULL << 23))
+ offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
+
+ hash = vsid ^ (offset / psize);
+ } else {
+ unsigned long offset, vsid;
+
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ offset = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (psize < (1ULL << 23))
+ offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
+
+ hash = vsid ^ (vsid << 25) ^ (offset / psize);
+ }
+
+ new_pteg = hash & new_hash_mask;
+ if (vpte & HPTE_V_SECONDARY) {
+ BUG_ON(~pteg != (hash & old_hash_mask));
+ new_pteg = ~new_pteg;
+ } else {
+ BUG_ON(pteg != (hash & old_hash_mask));
+ }
+
+ new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
+ new_hptep = (__be64 *)(new->virt + (new_idx << 4));
+
+ replace_vpte = be64_to_cpu(new_hptep[0]);
+
+ if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+ BUG_ON(new->order >= old->order);
+
+ if (replace_vpte & HPTE_V_BOLTED) {
+ if (vpte & HPTE_V_BOLTED)
+ /* Bolted collision, nothing we can do */
+ ret = -ENOSPC;
+ /* Discard the new HPTE */
+ goto out;
+ }
+
+ /* Discard the previous HPTE */
+ }
+
+ new_hptep[1] = cpu_to_be64(rpte);
+ new->rev[new_idx].guest_rpte = guest_rpte;
+ /* No need for a barrier, since new HPT isn't active */
+ new_hptep[0] = cpu_to_be64(vpte);
+ unlock_hpte(new_hptep, vpte);
+
+out:
+ unlock_hpte(hptep, vpte);
+ return ret;
+}
+
+static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
+{
+ struct kvm *kvm = resize->kvm;
+ unsigned long i;
+ int rc;
+
+ /*
+ * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
+ * that POWER9 uses, and could well hit a BUG_ON on POWER9.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return -EIO;
+ for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
+ rc = resize_hpt_rehash_hpte(resize, i);
+ if (rc != 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
+{
+ struct kvm *kvm = resize->kvm;
+ struct kvm_hpt_info hpt_tmp;
+
+ /* Exchange the pending tables in the resize structure with
+ * the active tables */
+
+ resize_hpt_debug(resize, "resize_hpt_pivot()\n");
+
+ spin_lock(&kvm->mmu_lock);
+ asm volatile("ptesync" : : : "memory");
+
+ hpt_tmp = kvm->arch.hpt;
+ kvmppc_set_hpt(kvm, &resize->hpt);
+ resize->hpt = hpt_tmp;
+
+ spin_unlock(&kvm->mmu_lock);
+
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
+}
+
+static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
+{
+ BUG_ON(kvm->arch.resize_hpt != resize);
+
+ if (!resize)
+ return;
+
+ if (resize->hpt.virt)
+ kvmppc_free_hpt(&resize->hpt);
+
+ kvm->arch.resize_hpt = NULL;
+ kfree(resize);
+}
+
+static void resize_hpt_prepare_work(struct work_struct *work)
+{
+ struct kvm_resize_hpt *resize = container_of(work,
+ struct kvm_resize_hpt,
+ work);
+ struct kvm *kvm = resize->kvm;
+ int err;
+
+ resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+ resize->order);
+
+ err = resize_hpt_allocate(resize);
+
+ mutex_lock(&kvm->lock);
+
+ resize->error = err;
+ resize->prepare_done = true;
+
+ mutex_unlock(&kvm->lock);
+}
+
+long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt)
+{
+ unsigned long flags = rhpt->flags;
+ unsigned long shift = rhpt->shift;
+ struct kvm_resize_hpt *resize;
+ int ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (shift && ((shift < 18) || (shift > 46)))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ resize = kvm->arch.resize_hpt;
+
+ if (resize) {
+ if (resize->order == shift) {
+ /* Suitable resize in progress */
+ if (resize->prepare_done) {
+ ret = resize->error;
+ if (ret != 0)
+ resize_hpt_release(kvm, resize);
+ } else {
+ ret = 100; /* estimated time in ms */
+ }
+
+ goto out;
+ }
+
+ /* not suitable, cancel it */
+ resize_hpt_release(kvm, resize);
+ }
+
+ ret = 0;
+ if (!shift)
+ goto out; /* nothing to do */
+
+ /* start new resize */
+
+ resize = kzalloc(sizeof(*resize), GFP_KERNEL);
+ resize->order = shift;
+ resize->kvm = kvm;
+ INIT_WORK(&resize->work, resize_hpt_prepare_work);
+ kvm->arch.resize_hpt = resize;
+
+ schedule_work(&resize->work);
+
+ ret = 100; /* estimated time in ms */
+
+out:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static void resize_hpt_boot_vcpu(void *opaque)
+{
+ /* Nothing to do, just force a KVM exit */
+}
+
+long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt)
+{
+ unsigned long flags = rhpt->flags;
+ unsigned long shift = rhpt->shift;
+ struct kvm_resize_hpt *resize;
+ long ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (shift && ((shift < 18) || (shift > 46)))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ resize = kvm->arch.resize_hpt;
+
+ /* This shouldn't be possible */
+ ret = -EIO;
+ if (WARN_ON(!kvm->arch.hpte_setup_done))
+ goto out_no_hpt;
+
+ /* Stop VCPUs from running while we mess with the HPT */
+ kvm->arch.hpte_setup_done = 0;
+ smp_mb();
+
+ /* Boot all CPUs out of the guest so they re-read
+ * hpte_setup_done */
+ on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
+
+ ret = -ENXIO;
+ if (!resize || (resize->order != shift))
+ goto out;
+
+ ret = -EBUSY;
+ if (!resize->prepare_done)
+ goto out;
+
+ ret = resize->error;
+ if (ret != 0)
+ goto out;
+
+ ret = resize_hpt_rehash(resize);
+ if (ret != 0)
+ goto out;
+
+ resize_hpt_pivot(resize);
+
+out:
+ /* Let VCPUs run again */
+ kvm->arch.hpte_setup_done = 1;
+ smp_mb();
+out_no_hpt:
+ resize_hpt_release(kvm, resize);
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+/*
* Functions for reading and writing the hash table via reads and
* writes on a file descriptor.
*
@@ -1290,8 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
flags = ctx->flags;
i = ctx->index;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
- revp = kvm->arch.revmap + i;
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+ revp = kvm->arch.hpt.rev + i;
lbuf = (unsigned long __user *)buf;
nb = 0;
@@ -1306,7 +1721,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
/* Skip uninteresting entries, i.e. clean on not-first pass */
if (!first_pass) {
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
!hpte_dirty(revp, hptp)) {
++i;
hptp += 2;
@@ -1316,7 +1731,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
hdr.index = i;
/* Grab a series of valid entries */
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
hdr.n_valid < 0xffff &&
nb + HPTE_SIZE < count &&
record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
@@ -1332,7 +1747,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
++revp;
}
/* Now skip invalid entries while we can */
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
hdr.n_invalid < 0xffff &&
record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
/* found an invalid entry */
@@ -1353,7 +1768,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
}
/* Check if we've wrapped around the hash table */
- if (i >= kvm->arch.hpt_npte) {
+ if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
i = 0;
ctx->first_pass = 0;
break;
@@ -1412,11 +1827,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
err = -EINVAL;
i = hdr.index;
- if (i >= kvm->arch.hpt_npte ||
- i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+ if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
+ i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
break;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
lbuf = (unsigned long __user *)buf;
for (j = 0; j < hdr.n_valid; ++j) {
__be64 hpte_v;
@@ -1603,8 +2018,9 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
kvm = p->kvm;
i = p->hpt_index;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
- for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+ for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
+ ++i, hptp += 2) {
if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
continue;
@@ -1614,7 +2030,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
cpu_relax();
v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
hr = be64_to_cpu(hptp[1]);
- gr = kvm->arch.revmap[i].guest_rpte;
+ gr = kvm->arch.hpt.rev[i].guest_rpte;
unlock_hpte(hptp, v);
preempt_enable();
@@ -1675,7 +2091,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
- mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+ if (kvm_is_radix(vcpu->kvm))
+ mmu->xlate = kvmppc_mmu_radix_xlate;
+ else
+ mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
new file mode 100644
index 000000000000..4344651f408c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -0,0 +1,716 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+/*
+ * Supported radix tree geometry.
+ * Like p9, we support either 5 or 9 bits at the first (lowest) level,
+ * for a page size of 64k or 4k.
+ */
+static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+ struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 pid;
+ int ret, level, ps;
+ __be64 prte, rpte;
+ unsigned long root, pte, index;
+ unsigned long rts, bits, offset;
+ unsigned long gpa;
+ unsigned long proc_tbl_size;
+
+ /* Work out effective PID */
+ switch (eaddr >> 62) {
+ case 0:
+ pid = vcpu->arch.pid;
+ break;
+ case 3:
+ pid = 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+ proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
+ if (pid * 16 >= proc_tbl_size)
+ return -EINVAL;
+
+ /* Read partition table to find root of tree for effective PID */
+ ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16,
+ &prte, sizeof(prte));
+ if (ret)
+ return ret;
+
+ root = be64_to_cpu(prte);
+ rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
+ ((root & RTS2_MASK) >> RTS2_SHIFT);
+ bits = root & RPDS_MASK;
+ root = root & RPDB_MASK;
+
+ /* P9 DD1 interprets RTS (radix tree size) differently */
+ offset = rts + 31;
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+ offset -= 3;
+
+ /* current implementations only support 52-bit space */
+ if (offset != 52)
+ return -EINVAL;
+
+ for (level = 3; level >= 0; --level) {
+ if (level && bits != p9_supported_radix_bits[level])
+ return -EINVAL;
+ if (level == 0 && !(bits == 5 || bits == 9))
+ return -EINVAL;
+ offset -= bits;
+ index = (eaddr >> offset) & ((1UL << bits) - 1);
+ /* check that low bits of page table base are zero */
+ if (root & ((1UL << (bits + 3)) - 1))
+ return -EINVAL;
+ ret = kvm_read_guest(kvm, root + index * 8,
+ &rpte, sizeof(rpte));
+ if (ret)
+ return ret;
+ pte = __be64_to_cpu(rpte);
+ if (!(pte & _PAGE_PRESENT))
+ return -ENOENT;
+ if (pte & _PAGE_PTE)
+ break;
+ bits = pte & 0x1f;
+ root = pte & 0x0fffffffffffff00ul;
+ }
+ /* need a leaf at lowest level; 512GB pages not supported */
+ if (level < 0 || level == 3)
+ return -EINVAL;
+
+ /* offset is now log base 2 of the page size */
+ gpa = pte & 0x01fffffffffff000ul;
+ if (gpa & ((1ul << offset) - 1))
+ return -EINVAL;
+ gpa += eaddr & ((1ul << offset) - 1);
+ for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
+ if (offset == mmu_psize_defs[ps].shift)
+ break;
+ gpte->page_size = ps;
+
+ gpte->eaddr = eaddr;
+ gpte->raddr = gpa;
+
+ /* Work out permissions */
+ gpte->may_read = !!(pte & _PAGE_READ);
+ gpte->may_write = !!(pte & _PAGE_WRITE);
+ gpte->may_execute = !!(pte & _PAGE_EXEC);
+ if (kvmppc_get_msr(vcpu) & MSR_PR) {
+ if (pte & _PAGE_PRIVILEGED) {
+ gpte->may_read = 0;
+ gpte->may_write = 0;
+ gpte->may_execute = 0;
+ }
+ } else {
+ if (!(pte & _PAGE_PRIVILEGED)) {
+ /* Check AMR/IAMR to see if strict mode is in force */
+ if (vcpu->arch.amr & (1ul << 62))
+ gpte->may_read = 0;
+ if (vcpu->arch.amr & (1ul << 63))
+ gpte->may_write = 0;
+ if (vcpu->arch.iamr & (1ul << 62))
+ gpte->may_execute = 0;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_BASE_PSIZE MMU_PAGE_64K
+#else
+#define MMU_BASE_PSIZE MMU_PAGE_4K
+#endif
+
+static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
+ unsigned int pshift)
+{
+ int psize = MMU_BASE_PSIZE;
+
+ if (pshift >= PMD_SHIFT)
+ psize = MMU_PAGE_2M;
+ addr &= ~0xfffUL;
+ addr |= mmu_psize_defs[psize].ap << 5;
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
+ : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
+ asm volatile("ptesync": : :"memory");
+}
+
+unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
+ unsigned long clr, unsigned long set,
+ unsigned long addr, unsigned int shift)
+{
+ unsigned long old = 0;
+
+ if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
+ pte_present(*ptep)) {
+ /* have to invalidate it first */
+ old = __radix_pte_update(ptep, _PAGE_PRESENT, 0);
+ kvmppc_radix_tlbie_page(kvm, addr, shift);
+ set |= _PAGE_PRESENT;
+ old &= _PAGE_PRESENT;
+ }
+ return __radix_pte_update(ptep, clr, set) | old;
+}
+
+void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
+}
+
+static struct kmem_cache *kvm_pte_cache;
+
+static pte_t *kvmppc_pte_alloc(void)
+{
+ return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+}
+
+static void kvmppc_pte_free(pte_t *ptep)
+{
+ kmem_cache_free(kvm_pte_cache, ptep);
+}
+
+static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
+ unsigned int level, unsigned long mmu_seq)
+{
+ pgd_t *pgd;
+ pud_t *pud, *new_pud = NULL;
+ pmd_t *pmd, *new_pmd = NULL;
+ pte_t *ptep, *new_ptep = NULL;
+ unsigned long old;
+ int ret;
+
+ /* Traverse the guest's 2nd-level tree, allocate new levels needed */
+ pgd = kvm->arch.pgtable + pgd_index(gpa);
+ pud = NULL;
+ if (pgd_present(*pgd))
+ pud = pud_offset(pgd, gpa);
+ else
+ new_pud = pud_alloc_one(kvm->mm, gpa);
+
+ pmd = NULL;
+ if (pud && pud_present(*pud))
+ pmd = pmd_offset(pud, gpa);
+ else
+ new_pmd = pmd_alloc_one(kvm->mm, gpa);
+
+ if (level == 0 && !(pmd && pmd_present(*pmd)))
+ new_ptep = kvmppc_pte_alloc();
+
+ /* Check if we might have been invalidated; let the guest retry if so */
+ spin_lock(&kvm->mmu_lock);
+ ret = -EAGAIN;
+ if (mmu_notifier_retry(kvm, mmu_seq))
+ goto out_unlock;
+
+ /* Now traverse again under the lock and change the tree */
+ ret = -ENOMEM;
+ if (pgd_none(*pgd)) {
+ if (!new_pud)
+ goto out_unlock;
+ pgd_populate(kvm->mm, pgd, new_pud);
+ new_pud = NULL;
+ }
+ pud = pud_offset(pgd, gpa);
+ if (pud_none(*pud)) {
+ if (!new_pmd)
+ goto out_unlock;
+ pud_populate(kvm->mm, pud, new_pmd);
+ new_pmd = NULL;
+ }
+ pmd = pmd_offset(pud, gpa);
+ if (pmd_large(*pmd)) {
+ /* Someone else has instantiated a large page here; retry */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ if (level == 1 && !pmd_none(*pmd)) {
+ /*
+ * There's a page table page here, but we wanted
+ * to install a large page. Tell the caller and let
+ * it try installing a normal page if it wants.
+ */
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ if (level == 0) {
+ if (pmd_none(*pmd)) {
+ if (!new_ptep)
+ goto out_unlock;
+ pmd_populate(kvm->mm, pmd, new_ptep);
+ new_ptep = NULL;
+ }
+ ptep = pte_offset_kernel(pmd, gpa);
+ if (pte_present(*ptep)) {
+ /* PTE was previously valid, so invalidate it */
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+ 0, gpa, 0);
+ kvmppc_radix_tlbie_page(kvm, gpa, 0);
+ if (old & _PAGE_DIRTY)
+ mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+ } else {
+ kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+ }
+ ret = 0;
+
+ out_unlock:
+ spin_unlock(&kvm->mmu_lock);
+ if (new_pud)
+ pud_free(kvm->mm, new_pud);
+ if (new_pmd)
+ pmd_free(kvm->mm, new_pmd);
+ if (new_ptep)
+ kvmppc_pte_free(new_ptep);
+ return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned long ea, unsigned long dsisr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long mmu_seq, pte_size;
+ unsigned long gpa, gfn, hva, pfn;
+ struct kvm_memory_slot *memslot;
+ struct page *page = NULL, *pages[1];
+ long ret, npages, ok;
+ unsigned int writing;
+ struct vm_area_struct *vma;
+ unsigned long flags;
+ pte_t pte, *ptep;
+ unsigned long pgflags;
+ unsigned int shift, level;
+
+ /* Check for unusual errors */
+ if (dsisr & DSISR_UNSUPP_MMU) {
+ pr_err("KVM: Got unsupported MMU fault\n");
+ return -EFAULT;
+ }
+ if (dsisr & DSISR_BADACCESS) {
+ /* Reflect to the guest as DSI */
+ pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+
+ /* Translate the logical address and get the page */
+ gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+ gpa &= ~0xF000000000000000ul;
+ gfn = gpa >> PAGE_SHIFT;
+ if (!(dsisr & DSISR_PGDIRFAULT))
+ gpa |= ea & 0xfff;
+ memslot = gfn_to_memslot(kvm, gfn);
+
+ /* No memslot means it's an emulated MMIO region */
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+ if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS |
+ DSISR_SET_RC)) {
+ /*
+ * Bad address in guest page table tree, or other
+ * unusual error - reflect it to the guest as DSI.
+ */
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+ return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+ dsisr & DSISR_ISSTORE);
+ }
+
+ /* used to check for invalidations in progress */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ writing = (dsisr & DSISR_ISSTORE) != 0;
+ hva = gfn_to_hva_memslot(memslot, gfn);
+ if (dsisr & DSISR_SET_RC) {
+ /*
+ * Need to set an R or C bit in the 2nd-level tables;
+ * if the relevant bits aren't already set in the linux
+ * page tables, fall through to do the gup_fast to
+ * set them in the linux page tables too.
+ */
+ ok = 0;
+ pgflags = _PAGE_ACCESSED;
+ if (writing)
+ pgflags |= _PAGE_DIRTY;
+ local_irq_save(flags);
+ ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
+ NULL, NULL);
+ if (ptep) {
+ pte = READ_ONCE(*ptep);
+ if (pte_present(pte) &&
+ (pte_val(pte) & pgflags) == pgflags)
+ ok = 1;
+ }
+ local_irq_restore(flags);
+ if (ok) {
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+ spin_unlock(&kvm->mmu_lock);
+ return RESUME_GUEST;
+ }
+ ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
+ gpa, NULL, &shift);
+ if (ptep && pte_present(*ptep)) {
+ kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
+ gpa, shift);
+ spin_unlock(&kvm->mmu_lock);
+ return RESUME_GUEST;
+ }
+ spin_unlock(&kvm->mmu_lock);
+ }
+ }
+
+ ret = -EFAULT;
+ pfn = 0;
+ pte_size = PAGE_SIZE;
+ pgflags = _PAGE_READ | _PAGE_EXEC;
+ level = 0;
+ npages = get_user_pages_fast(hva, 1, writing, pages);
+ if (npages < 1) {
+ /* Check if it's an I/O mapping */
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, hva);
+ if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
+ (vma->vm_flags & VM_PFNMAP)) {
+ pfn = vma->vm_pgoff +
+ ((hva - vma->vm_start) >> PAGE_SHIFT);
+ pgflags = pgprot_val(vma->vm_page_prot);
+ }
+ up_read(&current->mm->mmap_sem);
+ if (!pfn)
+ return -EFAULT;
+ } else {
+ page = pages[0];
+ pfn = page_to_pfn(page);
+ if (PageHuge(page)) {
+ page = compound_head(page);
+ pte_size <<= compound_order(page);
+ /* See if we can insert a 2MB large-page PTE here */
+ if (pte_size >= PMD_SIZE &&
+ (gpa & PMD_MASK & PAGE_MASK) ==
+ (hva & PMD_MASK & PAGE_MASK)) {
+ level = 1;
+ pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
+ }
+ }
+ /* See if we can provide write access */
+ if (writing) {
+ /*
+ * We assume gup_fast has set dirty on the host PTE.
+ */
+ pgflags |= _PAGE_WRITE;
+ } else {
+ local_irq_save(flags);
+ ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
+ hva, NULL, NULL);
+ if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
+ pgflags |= _PAGE_WRITE;
+ local_irq_restore(flags);
+ }
+ }
+
+ /*
+ * Compute the PTE value that we need to insert.
+ */
+ pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
+ if (pgflags & _PAGE_WRITE)
+ pgflags |= _PAGE_DIRTY;
+ pte = pfn_pte(pfn, __pgprot(pgflags));
+
+ /* Allocate space in the tree and write the PTE */
+ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+ if (ret == -EBUSY) {
+ /*
+ * There's already a PMD where wanted to install a large page;
+ * for now, fall back to installing a small page.
+ */
+ level = 0;
+ pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
+ pte = pfn_pte(pfn, __pgprot(pgflags));
+ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+ }
+ if (ret == 0 || ret == -EAGAIN)
+ ret = RESUME_GUEST;
+
+ if (page) {
+ /*
+ * We drop pages[0] here, not page because page might
+ * have been set to the head page of a compound, but
+ * we have to drop the reference on the correct tail
+ * page to match the get inside gup()
+ */
+ put_page(pages[0]);
+ }
+ return ret;
+}
+
+static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn, unsigned int order)
+{
+ unsigned long i, limit;
+ unsigned long *dp;
+
+ if (!memslot->dirty_bitmap)
+ return;
+ limit = 1ul << order;
+ if (limit < BITS_PER_LONG) {
+ for (i = 0; i < limit; ++i)
+ mark_page_dirty(kvm, gfn + i);
+ return;
+ }
+ dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
+ limit /= BITS_PER_LONG;
+ for (i = 0; i < limit; ++i)
+ *dp++ = ~0ul;
+}
+
+/* Called with kvm->lock held */
+int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+{
+ pte_t *ptep;
+ unsigned long gpa = gfn << PAGE_SHIFT;
+ unsigned int shift;
+ unsigned long old;
+
+ ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+ NULL, &shift);
+ if (ptep && pte_present(*ptep)) {
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
+ gpa, shift);
+ kvmppc_radix_tlbie_page(kvm, gpa, shift);
+ if (old & _PAGE_DIRTY) {
+ if (!shift)
+ mark_page_dirty(kvm, gfn);
+ else
+ mark_pages_dirty(kvm, memslot,
+ gfn, shift - PAGE_SHIFT);
+ }
+ }
+ return 0;
+}
+
+/* Called with kvm->lock held */
+int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+{
+ pte_t *ptep;
+ unsigned long gpa = gfn << PAGE_SHIFT;
+ unsigned int shift;
+ int ref = 0;
+
+ ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+ NULL, &shift);
+ if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
+ kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
+ gpa, shift);
+ /* XXX need to flush tlb here? */
+ ref = 1;
+ }
+ return ref;
+}
+
+/* Called with kvm->lock held */
+int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+{
+ pte_t *ptep;
+ unsigned long gpa = gfn << PAGE_SHIFT;
+ unsigned int shift;
+ int ref = 0;
+
+ ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+ NULL, &shift);
+ if (ptep && pte_present(*ptep) && pte_young(*ptep))
+ ref = 1;
+ return ref;
+}
+
+/* Returns the number of PAGE_SIZE pages that are dirty */
+static int kvm_radix_test_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot, int pagenum)
+{
+ unsigned long gfn = memslot->base_gfn + pagenum;
+ unsigned long gpa = gfn << PAGE_SHIFT;
+ pte_t *ptep;
+ unsigned int shift;
+ int ret = 0;
+
+ ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+ NULL, &shift);
+ if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
+ ret = 1;
+ if (shift)
+ ret = 1 << (shift - PAGE_SHIFT);
+ kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
+ gpa, shift);
+ kvmppc_radix_tlbie_page(kvm, gpa, shift);
+ }
+ return ret;
+}
+
+long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+ struct kvm_memory_slot *memslot, unsigned long *map)
+{
+ unsigned long i, j;
+ unsigned long n, *p;
+ int npages;
+
+ /*
+ * Radix accumulates dirty bits in the first half of the
+ * memslot's dirty_bitmap area, for when pages are paged
+ * out or modified by the host directly. Pick up these
+ * bits and add them to the map.
+ */
+ n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
+ p = memslot->dirty_bitmap;
+ for (i = 0; i < n; ++i)
+ map[i] |= xchg(&p[i], 0);
+
+ for (i = 0; i < memslot->npages; i = j) {
+ npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
+
+ /*
+ * Note that if npages > 0 then i must be a multiple of npages,
+ * since huge pages are only used to back the guest at guest
+ * real addresses that are a multiple of their size.
+ * Since we have at most one PTE covering any given guest
+ * real address, if npages > 1 we can skip to i + npages.
+ */
+ j = i + 1;
+ if (npages)
+ for (j = i; npages; ++j, --npages)
+ __set_bit_le(j, map);
+ }
+ return 0;
+}
+
+static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
+ int psize, int *indexp)
+{
+ if (!mmu_psize_defs[psize].shift)
+ return;
+ info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
+ (mmu_psize_defs[psize].ap << 29);
+ ++(*indexp);
+}
+
+int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
+{
+ int i;
+
+ if (!radix_enabled())
+ return -EINVAL;
+ memset(info, 0, sizeof(*info));
+
+ /* 4k page size */
+ info->geometries[0].page_shift = 12;
+ info->geometries[0].level_bits[0] = 9;
+ for (i = 1; i < 4; ++i)
+ info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
+ /* 64k page size */
+ info->geometries[1].page_shift = 16;
+ for (i = 0; i < 4; ++i)
+ info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
+
+ i = 0;
+ add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
+ add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
+ add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
+ add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
+
+ return 0;
+}
+
+int kvmppc_init_vm_radix(struct kvm *kvm)
+{
+ kvm->arch.pgtable = pgd_alloc(kvm->mm);
+ if (!kvm->arch.pgtable)
+ return -ENOMEM;
+ return 0;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+ unsigned long ig, iu, im;
+ pte_t *pte;
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd;
+
+ if (!kvm->arch.pgtable)
+ return;
+ pgd = kvm->arch.pgtable;
+ for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+ if (!pgd_present(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
+ if (!pud_present(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
+ if (pmd_huge(*pmd)) {
+ pmd_clear(pmd);
+ continue;
+ }
+ if (!pmd_present(*pmd))
+ continue;
+ pte = pte_offset_map(pmd, 0);
+ memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
+ kvmppc_pte_free(pte);
+ pmd_clear(pmd);
+ }
+ pmd_free(kvm->mm, pmd_offset(pud, 0));
+ pud_clear(pud);
+ }
+ pud_free(kvm->mm, pud_offset(pgd, 0));
+ pgd_clear(pgd);
+ }
+ pgd_free(kvm->mm, kvm->arch.pgtable);
+}
+
+static void pte_ctor(void *addr)
+{
+ memset(addr, 0, PTE_TABLE_SIZE);
+}
+
+int kvmppc_radix_init(void)
+{
+ unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;
+
+ kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
+ if (!kvm_pte_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void kvmppc_radix_exit(void)
+{
+ kmem_cache_destroy(kvm_pte_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c379ff5a4438..ab9d14c0e460 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -102,9 +102,9 @@ static void release_spapr_tce_table(struct rcu_head *head)
kfree(stt);
}
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int kvm_spapr_tce_fault(struct vm_fault *vmf)
{
- struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+ struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
struct page *page;
if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
@@ -171,6 +171,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
}
+ ret = -ENOMEM;
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
GFP_KERNEL);
if (!stt)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ec34e39471a7..1e107ece4e37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
++vcpu->stat.halt_wakeup;
}
- if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
+ cpu = READ_ONCE(vcpu->arch.thread_cpu);
+ if (cpu >= 0 && kvmppc_ipi_thread(cpu))
return;
/* CPU points to the first thread of the core */
@@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
}
tvcpu->arch.prodded = 1;
smp_mb();
- if (vcpu->arch.ceded) {
- if (swait_active(&vcpu->wq)) {
- swake_up(&vcpu->wq);
- vcpu->stat.halt_wakeup++;
- }
- }
+ if (tvcpu->arch.ceded)
+ kvmppc_fast_vcpu_kick_hv(tvcpu);
break;
case H_CONFER:
target = kvmppc_get_gpr(vcpu, 4);
@@ -1135,7 +1132,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
/*
* Userspace can only modify DPFD (default prefetch depth),
* ILE (interrupt little-endian) and TC (translation control).
- * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
+ * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
*/
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -1821,6 +1818,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
vcpu->arch.vcore = vcore;
vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
vcpu->arch.thread_cpu = -1;
+ vcpu->arch.prev_cpu = -1;
vcpu->arch.cpu_type = KVM_CPU_3S_64;
kvmppc_sanity_check(vcpu);
@@ -1950,11 +1948,33 @@ static void kvmppc_release_hwthread(int cpu)
tpaca->kvm_hstate.kvm_split_mode = NULL;
}
+static void do_nothing(void *x)
+{
+}
+
+static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ cpu = cpu_first_thread_sibling(cpu);
+ cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+ /*
+ * Make sure setting of bit in need_tlb_flush precedes
+ * testing of cpu_in_guest bits. The matching barrier on
+ * the other side is the first smp_mb() in kvmppc_run_core().
+ */
+ smp_mb();
+ for (i = 0; i < threads_per_core; ++i)
+ if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+ smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+}
+
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
int cpu;
struct paca_struct *tpaca;
struct kvmppc_vcore *mvc = vc->master_vcore;
+ struct kvm *kvm = vc->kvm;
cpu = vc->pcpu;
if (vcpu) {
@@ -1965,6 +1985,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
cpu += vcpu->arch.ptid;
vcpu->cpu = mvc->pcpu;
vcpu->arch.thread_cpu = cpu;
+
+ /*
+ * With radix, the guest can do TLB invalidations itself,
+ * and it could choose to use the local form (tlbiel) if
+ * it is invalidating a translation that has only ever been
+ * used on one vcpu. However, that doesn't mean it has
+ * only ever been used on one physical cpu, since vcpus
+ * can move around between pcpus. To cope with this, when
+ * a vcpu moves from one pcpu to another, we need to tell
+ * any vcpus running on the same core as this vcpu previously
+ * ran to flush the TLB. The TLB is shared between threads,
+ * so we use a single bit in .need_tlb_flush for all 4 threads.
+ */
+ if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
+ if (vcpu->arch.prev_cpu >= 0 &&
+ cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+ cpu_first_thread_sibling(cpu))
+ radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+ vcpu->arch.prev_cpu = cpu;
+ }
+ cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = &paca[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -2552,6 +2593,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
kvmppc_release_hwthread(pcpu + i);
if (sip && sip->napped[i])
kvmppc_ipi_thread(pcpu + i);
+ cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
kvmppc_set_host_core(pcpu);
@@ -2620,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+ if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
+ vcpu->arch.prodded)
return 1;
}
@@ -2806,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
n_ceded = 0;
for_each_runnable_thread(i, v, vc) {
- if (!v->arch.pending_exceptions)
+ if (!v->arch.pending_exceptions && !v->arch.prodded)
n_ceded += v->arch.ceded;
else
v->arch.ceded = 0;
@@ -2877,7 +2920,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
smp_mb();
/* On the first time here, set up HTAB and VRMA */
- if (!vcpu->kvm->arch.hpte_setup_done) {
+ if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
r = kvmppc_hv_setup_htab_rma(vcpu);
if (r)
goto out;
@@ -2939,6 +2982,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
{
struct kvm_ppc_one_seg_page_size *sps;
+ /*
+ * Since we don't yet support HPT guests on a radix host,
+ * return an error if the host uses radix.
+ */
+ if (radix_enabled())
+ return -EINVAL;
+
info->flags = KVM_PPC_PAGE_SIZES_REAL;
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
info->flags |= KVM_PPC_1T_SEGMENTS;
@@ -2961,8 +3011,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- int r;
+ int i, r;
unsigned long n;
+ unsigned long *buf;
+ struct kvm_vcpu *vcpu;
mutex_lock(&kvm->slots_lock);
@@ -2976,15 +3028,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
if (!memslot->dirty_bitmap)
goto out;
+ /*
+ * Use second half of bitmap area because radix accumulates
+ * bits in the first half.
+ */
n = kvm_dirty_bitmap_bytes(memslot);
- memset(memslot->dirty_bitmap, 0, n);
+ buf = memslot->dirty_bitmap + n / sizeof(long);
+ memset(buf, 0, n);
- r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
+ if (kvm_is_radix(kvm))
+ r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
+ else
+ r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
if (r)
goto out;
+ /* Harvest dirty bits from VPA and DTL updates */
+ /* Note: we never modify the SLB shadow buffer areas */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
+ kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ }
+
r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+ if (copy_to_user(log->dirty_bitmap, buf, n))
goto out;
r = 0;
@@ -3005,6 +3074,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
unsigned long npages)
{
+ /*
+ * For now, if radix_enabled() then we only support radix guests,
+ * and in that case we don't need the rmap array.
+ */
+ if (radix_enabled()) {
+ slot->arch.rmap = NULL;
+ return 0;
+ }
+
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
if (!slot->arch.rmap)
return -ENOMEM;
@@ -3037,7 +3115,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
- if (npages && old->npages) {
+ if (npages && old->npages && !kvm_is_radix(kvm)) {
/*
* If modifying a memslot, reset all the rmap dirty bits.
* If this is a new memslot, we don't need to do anything
@@ -3046,7 +3124,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
*/
slots = kvm_memslots(kvm);
memslot = id_to_memslot(slots, mem->slot);
- kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+ kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
}
}
@@ -3085,14 +3163,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
{
unsigned long dw0, dw1;
- /* PS field - page size for VRMA */
- dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
- ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
- /* HTABSIZE and HTABORG fields */
- dw0 |= kvm->arch.sdr1;
+ if (!kvm_is_radix(kvm)) {
+ /* PS field - page size for VRMA */
+ dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+ ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+ /* HTABSIZE and HTABORG fields */
+ dw0 |= kvm->arch.sdr1;
- /* Second dword has GR=0; other fields are unused since UPRT=0 */
- dw1 = 0;
+ /* Second dword as set by userspace */
+ dw1 = kvm->arch.process_table;
+ } else {
+ dw0 = PATB_HR | radix__get_tree_size() |
+ __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
+ dw1 = PATB_GR | kvm->arch.process_table;
+ }
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
@@ -3113,12 +3197,23 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out; /* another vcpu beat us to it */
/* Allocate hashed page table (if not done already) and reset it */
- if (!kvm->arch.hpt_virt) {
- err = kvmppc_alloc_hpt(kvm, NULL);
- if (err) {
+ if (!kvm->arch.hpt.virt) {
+ int order = KVM_DEFAULT_HPT_ORDER;
+ struct kvm_hpt_info info;
+
+ err = kvmppc_allocate_hpt(&info, order);
+ /* If we get here, it means userspace didn't specify a
+ * size explicitly. So, try successively smaller
+ * sizes if the default failed. */
+ while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
+ err = kvmppc_allocate_hpt(&info, order);
+
+ if (err < 0) {
pr_err("KVM: Couldn't alloc HPT\n");
goto out;
}
+
+ kvmppc_set_hpt(kvm, &info);
}
/* Look up the memslot for guest physical address 0 */
@@ -3262,6 +3357,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
unsigned long lpcr, lpid;
char buf[32];
+ int ret;
/* Allocate the guest's logical partition ID */
@@ -3309,13 +3405,33 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
lpcr |= LPCR_HVICE;
}
+ /*
+ * For now, if the host uses radix, the guest must be radix.
+ */
+ if (radix_enabled()) {
+ kvm->arch.radix = 1;
+ lpcr &= ~LPCR_VPM1;
+ lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+ ret = kvmppc_init_vm_radix(kvm);
+ if (ret) {
+ kvmppc_free_lpid(kvm->arch.lpid);
+ return ret;
+ }
+ kvmppc_setup_partition_table(kvm);
+ }
+
kvm->arch.lpcr = lpcr;
+ /* Initialization for future HPT resizes */
+ kvm->arch.resize_hpt = NULL;
+
/*
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
- if (cpu_has_feature(CPU_FTR_ARCH_300))
+ if (kvm_is_radix(kvm))
+ kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
+ else if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
else if (cpu_has_feature(CPU_FTR_ARCH_207S))
kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */
@@ -3325,8 +3441,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
+ * On POWER9, we only need to do this for HPT guests on a radix
+ * host, which is not yet supported.
*/
- kvm_hv_vm_activated();
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm_hv_vm_activated();
/*
* Create a debugfs directory for the VM
@@ -3352,11 +3471,17 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
- kvm_hv_vm_deactivated();
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
- kvmppc_free_hpt(kvm);
+ kvmppc_free_lpid(kvm->arch.lpid);
+
+ if (kvm_is_radix(kvm))
+ kvmppc_free_radix(kvm);
+ else
+ kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_free_pimap(kvm);
}
@@ -3385,11 +3510,6 @@ static int kvmppc_core_check_processor_compat_hv(void)
if (!cpu_has_feature(CPU_FTR_HVMODE) ||
!cpu_has_feature(CPU_FTR_ARCH_206))
return -EIO;
- /*
- * Disable KVM for Power9 in radix mode.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
- return -EIO;
return 0;
}
@@ -3587,12 +3707,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
r = -EFAULT;
if (get_user(htab_order, (u32 __user *)argp))
break;
- r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+ r = kvmppc_alloc_reset_hpt(kvm, htab_order);
if (r)
break;
- r = -EFAULT;
- if (put_user(htab_order, (u32 __user *)argp))
- break;
r = 0;
break;
}
@@ -3607,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
break;
}
+ case KVM_PPC_RESIZE_HPT_PREPARE: {
+ struct kvm_ppc_resize_hpt rhpt;
+
+ r = -EFAULT;
+ if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+ break;
+
+ r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
+ break;
+ }
+
+ case KVM_PPC_RESIZE_HPT_COMMIT: {
+ struct kvm_ppc_resize_hpt rhpt;
+
+ r = -EFAULT;
+ if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+ break;
+
+ r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
+ break;
+ }
+
default:
r = -ENOTTY;
}
@@ -3657,6 +3796,41 @@ static void init_default_hcalls(void)
}
}
+static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
+{
+ unsigned long lpcr;
+ int radix;
+
+ /* If not on a POWER9, reject it */
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return -ENODEV;
+
+ /* If any unknown flags set, reject it */
+ if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
+ return -EINVAL;
+
+ /* We can't change a guest to/from radix yet */
+ radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
+ if (radix != kvm_is_radix(kvm))
+ return -EINVAL;
+
+ /* GR (guest radix) bit in process_table field must match */
+ if (!!(cfg->process_table & PATB_GR) != radix)
+ return -EINVAL;
+
+ /* Process table size field must be reasonable, i.e. <= 24 */
+ if ((cfg->process_table & PRTS_MASK) > 24)
+ return -EINVAL;
+
+ kvm->arch.process_table = cfg->process_table;
+ kvmppc_setup_partition_table(kvm);
+
+ lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
+ kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+
+ return 0;
+}
+
static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -3694,6 +3868,8 @@ static struct kvmppc_ops kvm_ops_hv = {
.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
#endif
+ .configure_mmu = kvmhv_configure_mmu,
+ .get_rmmu_info = kvmhv_get_rmmu_info,
};
static int kvm_init_subcore_bitmap(void)
@@ -3728,6 +3904,11 @@ static int kvm_init_subcore_bitmap(void)
return 0;
}
+static int kvmppc_radix_possible(void)
+{
+ return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
+}
+
static int kvmppc_book3s_init_hv(void)
{
int r;
@@ -3767,12 +3948,19 @@ static int kvmppc_book3s_init_hv(void)
init_vcore_lists();
r = kvmppc_mmu_hv_init();
+ if (r)
+ return r;
+
+ if (kvmppc_radix_possible())
+ r = kvmppc_radix_init();
return r;
}
static void kvmppc_book3s_exit_hv(void)
{
kvmppc_free_host_rm_ops();
+ if (kvmppc_radix_possible())
+ kvmppc_radix_exit();
kvmppc_hv_ops = NULL;
}
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5bb24be0b346..4d6c64b3041c 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -52,19 +52,20 @@ static int __init early_parse_kvm_cma_resv(char *p)
}
early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
-struct page *kvm_alloc_hpt(unsigned long nr_pages)
+struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
{
VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
- return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
+ return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES),
+ GFP_KERNEL);
}
-EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
-void kvm_release_hpt(struct page *page, unsigned long nr_pages)
+void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
{
cma_release(kvm_cma, page, nr_pages);
}
-EXPORT_SYMBOL_GPL(kvm_release_hpt);
+EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
/**
* kvm_cma_reserve() - reserve area for kvm hash pagetable
@@ -200,7 +201,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
/*
* Send an interrupt or message to another CPU.
- * This can only be called in real mode.
* The caller needs to include any barrier needed to order writes
* to memory vs. the IPI/message.
*/
@@ -229,8 +229,7 @@ void kvmhv_rm_send_ipi(int cpu)
if (xics_phys)
rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
else
- opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
- IPI_PRIORITY);
+ opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
}
/*
@@ -412,14 +411,13 @@ static long kvmppc_read_one_intr(bool *again)
/* Now read the interrupt from the ICP */
xics_phys = local_paca->kvm_hstate.xics_phys;
- if (!xics_phys) {
- /* Use OPAL to read the XIRR */
- rc = opal_rm_int_get_xirr(&xirr, false);
- if (rc < 0)
- return 1;
- } else {
+ rc = 0;
+ if (!xics_phys)
+ rc = opal_int_get_xirr(&xirr, false);
+ else
xirr = _lwzcix(xics_phys + XICS_XIRR);
- }
+ if (rc < 0)
+ return 1;
/*
* Save XIRR for later. Since we get control in reverse endian
@@ -445,15 +443,16 @@ static long kvmppc_read_one_intr(bool *again)
* If it is an IPI, clear the MFRR and EOI it.
*/
if (xisr == XICS_IPI) {
+ rc = 0;
if (xics_phys) {
_stbcix(xics_phys + XICS_MFRR, 0xff);
_stwcix(xics_phys + XICS_XIRR, xirr);
} else {
- opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
- rc = opal_rm_int_eoi(h_xirr);
- /* If rc > 0, there is another interrupt pending */
- *again = rc > 0;
+ opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
+ rc = opal_int_eoi(h_xirr);
}
+ /* If rc > 0, there is another interrupt pending */
+ *again = rc > 0;
/*
* Need to ensure side effects of above stores
@@ -474,8 +473,8 @@ static long kvmppc_read_one_intr(bool *again)
if (xics_phys)
_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
else
- opal_rm_int_set_mfrr(hard_smp_processor_id(),
- IPI_PRIORITY);
+ opal_int_set_mfrr(hard_smp_processor_id(),
+ IPI_PRIORITY);
/* Let side effects complete */
smp_mb();
return 1;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 9ef3c4be952f..6fca970373ee 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x)
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
int global;
+ int cpu;
/*
* If there is only one vcore, and it's currently running,
@@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
/* any other core might now have stale TLB entries... */
smp_wmb();
cpumask_setall(&kvm->arch.need_tlb_flush);
- cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
- &kvm->arch.need_tlb_flush);
+ cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
+ /*
+ * On POWER9, threads are independent but the TLB is shared,
+ * so use the bit for the first thread to represent the core.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ cpu = cpu_first_thread_sibling(cpu);
+ cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
}
return global;
@@ -79,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
if (*rmap & KVMPPC_RMAP_PRESENT) {
i = *rmap & KVMPPC_RMAP_INDEX;
- head = &kvm->arch.revmap[i];
+ head = &kvm->arch.hpt.rev[i];
if (realmode)
head = real_vmalloc_addr(head);
- tail = &kvm->arch.revmap[head->back];
+ tail = &kvm->arch.hpt.rev[head->back];
if (realmode)
tail = real_vmalloc_addr(tail);
rev->forw = i;
@@ -147,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
lock_rmap(rmap);
head = *rmap & KVMPPC_RMAP_INDEX;
- next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
- prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+ next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]);
+ prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]);
next->back = rev->back;
prev->forw = rev->forw;
if (head == pte_index) {
@@ -182,6 +189,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
unsigned long mmu_seq;
unsigned long rcbits, irq_flags = 0;
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
psize = hpte_page_size(pteh, ptel);
if (!psize)
return H_PARAMETER;
@@ -283,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Find and lock the HPTEG slot to use */
do_insert:
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
if (likely((flags & H_EXACT) == 0)) {
pte_index &= ~7UL;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
for (i = 0; i < 8; ++i) {
if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
@@ -318,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
pte_index += i;
} else {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
HPTE_V_ABSENT)) {
/* Lock the slot and check again */
@@ -335,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
/* Save away the guest's idea of the second HPTE dword */
- rev = &kvm->arch.revmap[pte_index];
+ rev = &kvm->arch.hpt.rev[pte_index];
if (realmode)
rev = real_vmalloc_addr(rev);
if (rev) {
@@ -458,9 +467,11 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
struct revmap_entry *rev;
u64 pte, orig_pte, pte_r;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
pte = orig_pte = be64_to_cpu(hpte[0]);
@@ -476,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
return H_NOT_FOUND;
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
v = pte & ~HPTE_V_HVLOCK;
if (v & HPTE_V_VALID) {
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
@@ -529,6 +540,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
struct revmap_entry *rev, *revs[4];
u64 hp0, hp1;
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
global = global_invalidates(kvm, 0);
for (i = 0; i < 4 && ret == H_SUCCESS; ) {
n = 0;
@@ -544,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
break;
}
if (req != 1 || flags == 3 ||
- pte_index >= kvm->arch.hpt_npte) {
+ pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
/* parameter error */
args[j] = ((0xa0 | flags) << 56) + pte_index;
ret = H_PARAMETER;
break;
}
- hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
+ hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4));
/* to avoid deadlock, don't spin except for first */
if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
if (n)
@@ -587,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
}
args[j] = ((0x80 | flags) << 56) + pte_index;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
note_hpte_modification(kvm, rev);
if (!(hp0 & HPTE_V_VALID)) {
@@ -642,10 +655,12 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long v, r, rb, mask, bits;
u64 pte_v, pte_r;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = pte_v = be64_to_cpu(hpte[0]);
@@ -665,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
/* Update guest view of 2nd HPTE dword */
mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
HPTE_R_KEY_HI | HPTE_R_KEY_LO;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
if (rev) {
r = (rev->guest_rpte & ~mask) | bits;
rev->guest_rpte = r;
@@ -711,15 +726,17 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
int i, n = 1;
struct revmap_entry *rev = NULL;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
if (flags & H_READ_4) {
pte_index &= ~3;
n = 4;
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
for (i = 0; i < n; ++i, ++pte_index) {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
@@ -750,11 +767,13 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long *rmap;
long ret = H_NOT_FOUND;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hpte[0]);
@@ -796,11 +815,13 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long *rmap;
long ret = H_NOT_FOUND;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (kvm_is_radix(kvm))
+ return H_FUNCTION;
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hpte[0]);
@@ -949,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
somask = (1UL << 28) - 1;
vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
}
- hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
+ hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt);
avpn = slb_v & ~(somask >> 16); /* also includes B */
avpn |= (eaddr & somask) >> 16;
@@ -960,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
val |= avpn;
for (;;) {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7));
for (i = 0; i < 16; i += 2) {
/* Read the PTE racily */
@@ -996,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
if (val & HPTE_V_SECONDARY)
break;
val |= HPTE_V_SECONDARY;
- hash = hash ^ kvm->arch.hpt_mask;
+ hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt);
}
return -1;
}
@@ -1045,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
return status; /* there really was no HPTE */
return 0; /* for prot fault, HPTE disappeared */
}
- hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, r);
r = hpte_new_to_old_r(r);
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]);
gr = rev->guest_rpte;
unlock_hpte(hpte, orig_v);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 06edc4366639..e78542d99cd6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -35,8 +35,8 @@ int kvm_irq_bypass = 1;
EXPORT_SYMBOL(kvm_irq_bypass);
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
-static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
+ u32 new_irq, bool check_resend);
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu);
/* -- ICS routines -- */
static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
{
int i;
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- arch_spin_unlock(&ics->lock);
- icp_rm_deliver_irq(xics, icp, state->number);
- arch_spin_lock(&ics->lock);
+ if (state->resend)
+ icp_rm_deliver_irq(xics, icp, state->number, true);
}
- arch_spin_unlock(&ics->lock);
}
/* -- ICP routines -- */
@@ -70,11 +62,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
hcpu = hcore << threads_shift;
kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
- if (paca[hcpu].kvm_hstate.xics_phys)
- icp_native_cause_ipi_rm(hcpu);
- else
- opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
- IPI_PRIORITY);
+ kvmppc_set_host_ipi(hcpu, 1);
+ smp_mb();
+ kvmhv_rm_send_ipi(hcpu);
}
#else
static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
@@ -290,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -335,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -380,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
if (reject && reject != XICS_IPI) {
arch_spin_unlock(&ics->lock);
+ icp->n_reject++;
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -388,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_rm_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -399,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
+ check_resend = 0;
goto again;
}
}
@@ -594,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject in real mode */
if (reject && reject != XICS_IPI) {
this_icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
/* Handle resends in real mode */
@@ -662,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
*/
if (reject && reject != XICS_IPI) {
icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
bail:
return check_too_hard(xics, icp);
}
-int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
-
- if (!xics || !xics->real_mode)
- return H_TOO_HARD;
+ u32 pq_old, pq_new;
/*
- * ICP State: EOI
- *
- * Note: If EOI is incorrectly used by SW to lower the CPPR
- * value (ie more favored), we do not check for rejection of
- * a pending interrupt, this is a SW error and PAPR sepcifies
- * that we don't have to deal with it.
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
*
- * The sending of an EOI to the ICS is handled after the
- * CPPR update
- *
- * ICP State: Down_CPPR which we handle
- * in a separate function as it's shared with H_CPPR.
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
*/
- icp_rm_down_cppr(xics, icp, xirr >> 24);
- /* IPIs have no EOI */
- if (irq == XICS_IPI)
- goto bail;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
ics = kvmppc_xics_find_ics(xics, irq, &src);
if (!ics)
goto bail;
+
state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted) {
- icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, irq);
- }
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, NULL, irq, false);
if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
icp->rm_action |= XICS_RM_NOTIFY_EOI;
@@ -730,15 +720,48 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
++vcpu->stat.pthru_host;
if (state->intr_cpu != pcpu) {
++vcpu->stat.pthru_bad_aff;
- xics_opal_rm_set_server(state->host_irq, pcpu);
+ xics_opal_set_server(state->host_irq, pcpu);
}
state->intr_cpu = -1;
}
}
+
bail:
return check_too_hard(xics, icp);
}
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /*
+ * ICP State: EOI
+ *
+ * Note: If EOI is incorrectly used by SW to lower the CPPR
+ * value (ie more favored), we do not check for rejection of
+ * a pending interrupt, this is a SW error and PAPR specifies
+ * that we don't have to deal with it.
+ *
+ * The sending of an EOI to the ICS is handled after the
+ * CPPR update
+ *
+ * ICP State: Down_CPPR which we handle
+ * in a separate function as it's shared with H_CPPR.
+ */
+ icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+ /* IPIs have no EOI */
+ if (irq == XICS_IPI)
+ return check_too_hard(xics, icp);
+
+ return ics_rm_eoi(vcpu, irq);
+}
+
unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
@@ -758,16 +781,16 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
if (xics_phys) {
_stwcix(xics_phys + XICS_XIRR, xirr);
} else {
- rc = opal_rm_int_eoi(be32_to_cpu(xirr));
+ rc = opal_int_eoi(be32_to_cpu(xirr));
*again = rc > 0;
}
}
-static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu)
{
unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
- return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+ return opal_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
}
/*
@@ -825,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
{
struct kvmppc_xics *xics;
struct kvmppc_icp *icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
u32 irq;
+ u16 src;
+ u32 pq_old, pq_new;
irq = irq_map->v_hwirq;
xics = vcpu->kvm->arch.xics;
icp = vcpu->arch.icp;
kvmppc_rm_handle_irq_desc(irq_map->desc);
- icp_rm_deliver_irq(xics, icp, irq);
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return 2;
+
+ state = &ics->irq_state[src];
+
+ /* only MSIs register bypass producers, so it must be MSI here */
+ do {
+ pq_old = state->pq_state;
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, icp, irq, false);
/* EOI the interrupt */
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9338a818e05c..47414a6fe2dd 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
+ /*
+ * If we came back from the guest via a relocation-on interrupt,
+ * we will be in virtual mode at this point, which makes it a
+ * little easier to get back to the caller.
+ */
+ mfmsr r0
+ andi. r0, r0, MSR_IR /* in real mode? */
+ bne .Lvirt_return
+
cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
@@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_HSRR1, r7
ba 0xe80
+ /* Virtual-mode return - can't get here for HMI or machine check */
+.Lvirt_return:
+ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
+ beq 16f
+ cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
+ beq 17f
+ andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
+ beq 18f
+ mtmsrd r7, 1 /* if so then re-enable them */
+18: mtlr r8
+ blr
+
+16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
+ mtspr SPRN_HSRR1, r7
+ b exc_virt_0x4500_hardware_interrupt
+
+17: mtspr SPRN_HSRR0, r8
+ mtspr SPRN_HSRR1, r7
+ b exc_virt_0x4e80_h_doorbell
+
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -518,6 +547,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Stack frame offsets */
#define STACK_SLOT_TID (112-16)
#define STACK_SLOT_PSSCR (112-24)
+#define STACK_SLOT_PID (112-32)
.global kvmppc_hv_entry
kvmppc_hv_entry:
@@ -530,6 +560,7 @@ kvmppc_hv_entry:
* R1 = host R1
* R2 = TOC
* all other volatile GPRS = free
+ * Does not preserve non-volatile GPRs or CR fields
*/
mflr r0
std r0, PPC_LR_STKOFF(r1)
@@ -549,32 +580,38 @@ kvmppc_hv_entry:
bl kvmhv_start_timing
1:
#endif
- /* Clear out SLB */
+
+ /* Use cr7 as an indication of radix mode */
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r9, VCORE_KVM(r5) /* pointer to struct kvm */
+ lbz r0, KVM_RADIX(r9)
+ cmpwi cr7, r0, 0
+
+ /* Clear out SLB if hash */
+ bne cr7, 2f
li r6,0
slbmte r6,r6
slbia
ptesync
-
+2:
/*
* POWER7/POWER8 host -> guest partition switch code.
* We don't have to lock against concurrent tlbies,
* but we do have to coordinate across hardware threads.
*/
/* Set bit in entry map iff exit map is zero. */
- ld r5, HSTATE_KVM_VCORE(r13)
li r7, 1
lbz r6, HSTATE_PTID(r13)
sld r7, r7, r6
- addi r9, r5, VCORE_ENTRY_EXIT
-21: lwarx r3, 0, r9
+ addi r8, r5, VCORE_ENTRY_EXIT
+21: lwarx r3, 0, r8
cmpwi r3, 0x100 /* any threads starting to exit? */
bge secondary_too_late /* if so we're too late to the party */
or r3, r3, r7
- stwcx. r3, 0, r9
+ stwcx. r3, 0, r8
bne 21b
/* Primary thread switches to guest partition. */
- ld r9,VCORE_KVM(r5) /* pointer to struct kvm */
cmpwi r6,0
bne 10f
lwz r7,KVM_LPID(r9)
@@ -590,30 +627,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* See if we need to flush the TLB */
lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
+BEGIN_FTR_SECTION
+ /*
+ * On POWER9, individual threads can come in here, but the
+ * TLB is shared between the 4 threads in a core, hence
+ * invalidating on one thread invalidates for all.
+ * Thus we make all 4 threads use the same bit here.
+ */
+ clrrdi r6,r6,2
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
clrldi r7,r6,64-6 /* extract bit number (6 bits) */
srdi r6,r6,6 /* doubleword number */
sldi r6,r6,3 /* address offset */
add r6,r6,r9
addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
- li r0,1
- sld r0,r0,r7
+ li r8,1
+ sld r8,r8,r7
ld r7,0(r6)
- and. r7,r7,r0
+ and. r7,r7,r8
beq 22f
-23: ldarx r7,0,r6 /* if set, clear the bit */
- andc r7,r7,r0
- stdcx. r7,0,r6
- bne 23b
/* Flush the TLB of any entries for this LPID */
- lwz r6,KVM_TLB_SETS(r9)
- li r0,0 /* RS for P9 version of tlbiel */
- mtctr r6
+ lwz r0,KVM_TLB_SETS(r9)
+ mtctr r0
li r7,0x800 /* IS field = 0b10 */
ptesync
-28: tlbiel r7
+ li r0,0 /* RS for P9 version of tlbiel */
+ bne cr7, 29f
+28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */
addi r7,r7,0x1000
bdnz 28b
- ptesync
+ b 30f
+29: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */
+ addi r7,r7,0x1000
+ bdnz 29b
+30: ptesync
+23: ldarx r7,0,r6 /* clear the bit after TLB flushed */
+ andc r7,r7,r8
+ stdcx. r7,0,r6
+ bne 23b
/* Add timebase offset onto timebase */
22: ld r8,VCORE_TB_OFFSET(r5)
@@ -658,7 +709,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
beq kvmppc_primary_no_guest
kvmppc_got_guest:
- /* Load up guest SLB entries */
+ /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
lwz r5,VCPU_SLB_MAX(r4)
cmpwi r5,0
beq 9f
@@ -696,8 +747,10 @@ kvmppc_got_guest:
BEGIN_FTR_SECTION
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
+ mfspr r7, SPRN_PID
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
+ std r7, STACK_SLOT_PID(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
@@ -824,6 +877,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_PID, r7
mtspr SPRN_WORT, r8
BEGIN_FTR_SECTION
+ PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+BEGIN_FTR_SECTION
/* POWER8-only registers */
ld r5, VCPU_TCSCR(r4)
ld r6, VCPU_ACOP(r4)
@@ -1057,13 +1113,13 @@ hdec_soon:
kvmppc_interrupt_hv:
/*
* Register contents:
- * R12 = interrupt vector
+ * R12 = (guest CR << 32) | interrupt vector
* R13 = PACA
- * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+ * guest R12 saved in shadow VCPU SCRATCH0
+ * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE
* guest R13 saved in SPRN_SCRATCH0
*/
std r9, HSTATE_SCRATCH2(r13)
-
lbz r9, HSTATE_IN_GUEST(r13)
cmpwi r9, KVM_GUEST_MODE_HOST_HV
beq kvmppc_bad_host_intr
@@ -1094,8 +1150,9 @@ kvmppc_interrupt_hv:
std r10, VCPU_GPR(R10)(r9)
std r11, VCPU_GPR(R11)(r9)
ld r3, HSTATE_SCRATCH0(r13)
- lwz r4, HSTATE_SCRATCH1(r13)
std r3, VCPU_GPR(R12)(r9)
+ /* CR is in the high half of r12 */
+ srdi r4, r12, 32
stw r4, VCPU_CR(r9)
BEGIN_FTR_SECTION
ld r3, HSTATE_CFAR(r13)
@@ -1114,6 +1171,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
mfspr r11, SPRN_SRR1
std r10, VCPU_SRR0(r9)
std r11, VCPU_SRR1(r9)
+ /* trap is in the low half of r12, clear CR from the high half */
+ clrldi r12, r12, 32
andi. r0, r12, 2 /* need to read HSRR0/1? */
beq 1f
mfspr r10, SPRN_HSRR0
@@ -1149,7 +1208,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
11: stw r3,VCPU_HEIR(r9)
/* these are volatile across C function calls */
+#ifdef CONFIG_RELOCATABLE
+ ld r3, HSTATE_SCRATCH1(r13)
+ mtctr r3
+#else
mfctr r3
+#endif
mfxer r4
std r3, VCPU_CTR(r9)
std r4, VCPU_XER(r9)
@@ -1285,11 +1349,15 @@ mc_cont:
mtspr SPRN_CTRLT,r6
4:
/* Read the guest SLB and save it away */
+ ld r5, VCPU_KVM(r9)
+ lbz r0, KVM_RADIX(r5)
+ cmpwi r0, 0
+ li r5, 0
+ bne 3f /* for radix, save 0 entries */
lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
mtctr r0
li r6,0
addi r7,r9,VCPU_SLB
- li r5,0
1: slbmfee r8,r6
andis. r0,r8,SLB_ESID_V@h
beq 2f
@@ -1301,7 +1369,7 @@ mc_cont:
addi r5,r5,1
2: addi r6,r6,1
bdnz 1b
- stw r5,VCPU_SLB_MAX(r9)
+3: stw r5,VCPU_SLB_MAX(r9)
/*
* Save the guest PURR/SPURR
@@ -1550,9 +1618,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
+ ld r7, STACK_SLOT_PID(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
+ mtspr SPRN_PID, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+ PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
/*
* POWER7/POWER8 guest -> host partition switch code.
@@ -1663,6 +1736,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
isync
/* load host SLB entries */
+BEGIN_MMU_FTR_SECTION
+ b 0f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
ld r8,PACA_SLBSHADOWPTR(r13)
.rept SLB_NUM_BOLTED
@@ -1675,7 +1751,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
slbmte r6,r5
1: addi r8,r8,16
.endr
-
+0:
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
/* Finish timing, if we have a vcpu */
ld r4, HSTATE_KVM_VCPU(r13)
@@ -1702,11 +1778,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* reflect the HDSI to the guest as a DSI.
*/
kvmppc_hdsi:
+ ld r3, VCPU_KVM(r9)
+ lbz r0, KVM_RADIX(r3)
+ cmpwi r0, 0
mfspr r4, SPRN_HDAR
mfspr r6, SPRN_HDSISR
+ bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */
/* HPTE not found fault or protection fault? */
andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
beq 1f /* if not, send it to the guest */
+BEGIN_FTR_SECTION
+ mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */
+ b 4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
andi. r0, r11, MSR_DR /* data relocation enabled? */
beq 3f
clrrdi r0, r4, 28
@@ -1776,13 +1860,29 @@ fast_interrupt_c_return:
stb r0, HSTATE_IN_GUEST(r13)
b guest_exit_cont
+.Lradix_hdsi:
+ std r4, VCPU_FAULT_DAR(r9)
+ stw r6, VCPU_FAULT_DSISR(r9)
+.Lradix_hisi:
+ mfspr r5, SPRN_ASDR
+ std r5, VCPU_FAULT_GPA(r9)
+ b guest_exit_cont
+
/*
* Similarly for an HISI, reflect it to the guest as an ISI unless
* it is an HPTE not found fault for a page that we have paged out.
*/
kvmppc_hisi:
+ ld r3, VCPU_KVM(r9)
+ lbz r0, KVM_RADIX(r3)
+ cmpwi r0, 0
+ bne .Lradix_hisi /* for radix, just save ASDR */
andis. r0, r11, SRR1_ISI_NOPT@h
beq 1f
+BEGIN_FTR_SECTION
+ mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */
+ b 4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
andi. r0, r11, MSR_IR /* instruction relocation enabled? */
beq 3f
clrrdi r0, r10, 28
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 1482961ceb4d..d4dfc0ca2a44 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
}
}
+static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int exit_nr)
+{
+ enum emulation_result er;
+ ulong flags;
+ u32 last_inst;
+ int emul, r;
+
+ /*
+ * shadow_srr1 only contains valid flags if we came here via a program
+ * exception. The other exceptions (emulation assist, FP unavailable,
+ * etc.) do not provide flags in SRR1, so use an illegal-instruction
+ * exception when injecting a program interrupt into the guest.
+ */
+ if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ else
+ flags = SRR1_PROGILL;
+
+ emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+ if (emul != EMULATE_DONE)
+ return RESUME_GUEST;
+
+ if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+ pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+ kvmppc_get_pc(vcpu), last_inst);
+#endif
+ if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
+ kvmppc_core_queue_program(vcpu, flags);
+ return RESUME_GUEST;
+ }
+ }
+
+ vcpu->stat.emulated_inst_exits++;
+ er = kvmppc_emulate_instruction(run, vcpu);
+ switch (er) {
+ case EMULATE_DONE:
+ r = RESUME_GUEST_NV;
+ break;
+ case EMULATE_AGAIN:
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_FAIL:
+ pr_crit("%s: emulation at %lx failed (%08x)\n",
+ __func__, kvmppc_get_pc(vcpu), last_inst);
+ kvmppc_core_queue_program(vcpu, flags);
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_DO_MMIO:
+ run->exit_reason = KVM_EXIT_MMIO;
+ r = RESUME_HOST_NV;
+ break;
+ case EMULATE_EXIT_USER:
+ r = RESUME_HOST_NV;
+ break;
+ default:
+ BUG();
+ }
+
+ return r;
+}
+
int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOK3S_INTERRUPT_PROGRAM:
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
- {
- enum emulation_result er;
- ulong flags;
- u32 last_inst;
- int emul;
-
-program_interrupt:
- /*
- * shadow_srr1 only contains valid flags if we came here via
- * a program exception. The other exceptions (emulation assist,
- * FP unavailable, etc.) do not provide flags in SRR1, so use
- * an illegal-instruction exception when injecting a program
- * interrupt into the guest.
- */
- if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
- else
- flags = SRR1_PROGILL;
-
- emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
- if (emul != EMULATE_DONE) {
- r = RESUME_GUEST;
- break;
- }
-
- if (kvmppc_get_msr(vcpu) & MSR_PR) {
-#ifdef EXIT_DEBUG
- pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
- kvmppc_get_pc(vcpu), last_inst);
-#endif
- if ((last_inst & 0xff0007ff) !=
- (INS_DCBZ & 0xfffffff7)) {
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- }
- }
-
- vcpu->stat.emulated_inst_exits++;
- er = kvmppc_emulate_instruction(run, vcpu);
- switch (er) {
- case EMULATE_DONE:
- r = RESUME_GUEST_NV;
- break;
- case EMULATE_AGAIN:
- r = RESUME_GUEST;
- break;
- case EMULATE_FAIL:
- printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
- __func__, kvmppc_get_pc(vcpu), last_inst);
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- case EMULATE_DO_MMIO:
- run->exit_reason = KVM_EXIT_MMIO;
- r = RESUME_HOST_NV;
- break;
- case EMULATE_EXIT_USER:
- r = RESUME_HOST_NV;
- break;
- default:
- BUG();
- }
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
break;
- }
case BOOK3S_INTERRUPT_SYSCALL:
{
u32 last_sc;
@@ -1185,7 +1185,7 @@ program_interrupt:
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
&last_inst);
if (emul == EMULATE_DONE)
- goto program_interrupt;
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
else
r = RESUME_GUEST;
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index ca8f174289bb..2a2b96d53999 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -167,20 +167,38 @@ kvmppc_handler_trampoline_enter_end:
* *
*****************************************************************************/
-.global kvmppc_handler_trampoline_exit
-kvmppc_handler_trampoline_exit:
-
.global kvmppc_interrupt_pr
kvmppc_interrupt_pr:
+ /* 64-bit entry. Register usage at this point:
+ *
+ * SPRG_SCRATCH0 = guest R13
+ * R12 = (guest CR << 32) | exit handler id
+ * R13 = PACA
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE
+ */
+#ifdef CONFIG_PPC64
+ /* Match 32-bit entry */
+#ifdef CONFIG_RELOCATABLE
+ std r9, HSTATE_SCRATCH2(r13)
+ ld r9, HSTATE_SCRATCH1(r13)
+ mtctr r9
+ ld r9, HSTATE_SCRATCH2(r13)
+#endif
+ rotldi r12, r12, 32 /* Flip R12 halves for stw */
+ stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
+ srdi r12, r12, 32 /* shift trap into low half */
+#endif
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
/* Register usage at this point:
*
- * SPRG_SCRATCH0 = guest R13
- * R12 = exit handler id
- * R13 = shadow vcpu (32-bit) or PACA (64-bit)
+ * SPRG_SCRATCH0 = guest R13
+ * R12 = exit handler id
+ * R13 = shadow vcpu (32-bit) or PACA (64-bit)
* HSTATE.SCRATCH0 = guest R12
* HSTATE.SCRATCH1 = guest CR
- *
*/
/* Save registers */
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 20dff102a06f..e48803e2918d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -63,7 +63,7 @@
/* -- ICS routines -- */
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
+ u32 new_irq, bool check_resend);
/*
* Return value ideally indicates how the interrupt was handled, but no
@@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
struct ics_irq_state *state;
struct kvmppc_ics *ics;
u16 src;
+ u32 pq_old, pq_new;
XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
@@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
if (!state->exists)
return -EINVAL;
+ if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
+ level = 1;
+ else if (level == KVM_INTERRUPT_UNSET)
+ level = 0;
/*
- * We set state->asserted locklessly. This should be fine as
- * we are the only setter, thus concurrent access is undefined
- * to begin with.
+ * Take other values the same as 1, consistent with original code.
+ * maybe WARN here?
*/
- if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
- state->asserted = 1;
- else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
- state->asserted = 0;
+
+ if (!state->lsi && level == 0) /* noop for MSI */
return 0;
- }
+
+ do {
+ pq_old = state->pq_state;
+ if (state->lsi) {
+ if (level) {
+ if (pq_old & PQ_PRESENTED)
+ /* Setting already set LSI ... */
+ return 0;
+
+ pq_new = PQ_PRESENTED;
+ } else
+ pq_new = 0;
+ } else
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_deliver_irq(xics, NULL, irq, false);
/* Record which CPU this arrived on for passed-through interrupts */
if (state->host_irq)
state->intr_cpu = raw_smp_processor_id();
- /* Attempt delivery */
- icp_deliver_irq(xics, NULL, irq);
-
return 0;
}
@@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
{
int i;
- unsigned long flags;
-
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- XICS_DBG("resend %#x prio %#x\n", state->number,
- state->priority);
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
- icp_deliver_irq(xics, icp, state->number);
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
+ if (state->resend) {
+ XICS_DBG("resend %#x prio %#x\n", state->number,
+ state->priority);
+ icp_deliver_irq(xics, icp, state->number, true);
+ }
}
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
}
static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
deliver = false;
if ((state->masked_pending || state->resend) && priority != MASKED) {
state->masked_pending = 0;
+ state->resend = 0;
deliver = true;
}
@@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
state->masked_pending, state->resend);
if (write_xive(xics, ics, state, server, priority, priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
if (write_xive(xics, ics, state, state->server, state->saved_priority,
state->saved_priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
+ check_resend = 0;
goto again;
}
}
@@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject */
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
/* Handle resend */
if (resend)
@@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
* attempt (see comments in icp_deliver_irq).
*/
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
}
-static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
+ u32 pq_old, pq_new;
+
+ /*
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
+ *
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
+ */
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics) {
+ XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq);
+ return H_PARAMETER;
+ }
+ state = &ics->irq_state[src];
+
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_deliver_irq(xics, icp, irq, false);
+
+ kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+ return H_SUCCESS;
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
@@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
/* IPIs have no EOI */
if (irq == XICS_IPI)
return H_SUCCESS;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
- ics = kvmppc_xics_find_ics(xics, irq, &src);
- if (!ics) {
- XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
- return H_PARAMETER;
- }
- state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted)
- icp_deliver_irq(xics, icp, irq);
-
- kvm_notify_acked_irq(vcpu->kvm, 0, irq);
-
- return H_SUCCESS;
+ return ics_eoi(vcpu, irq);
}
int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
@@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
icp->n_rm_check_resend++;
icp_check_resend(xics, icp->rm_resend_icp);
}
- if (icp->rm_action & XICS_RM_REJECT) {
- icp->n_rm_reject++;
- icp_deliver_irq(xics, icp, icp->rm_reject);
- }
if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
icp->n_rm_notify_eoi++;
kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
@@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
int icsid, i;
unsigned long flags;
unsigned long t_rm_kick_vcpu, t_rm_check_resend;
- unsigned long t_rm_reject, t_rm_notify_eoi;
+ unsigned long t_rm_notify_eoi;
unsigned long t_reject, t_check_resend;
if (!kvm)
@@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu = 0;
t_rm_notify_eoi = 0;
t_rm_check_resend = 0;
- t_rm_reject = 0;
t_check_resend = 0;
t_reject = 0;
@@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
t_rm_notify_eoi += icp->n_rm_notify_eoi;
t_rm_check_resend += icp->n_rm_check_resend;
- t_rm_reject += icp->n_rm_reject;
t_check_resend += icp->n_check_resend;
t_reject += icp->n_reject;
}
- seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+ seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
t_rm_kick_vcpu, t_rm_check_resend,
- t_rm_reject, t_rm_notify_eoi);
+ t_rm_notify_eoi);
seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
t_check_resend, t_reject);
for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
@@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private)
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *irq = &ics->irq_state[i];
- seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+ seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
irq->number, irq->server, irq->priority,
- irq->saved_priority, irq->asserted,
+ irq->saved_priority, irq->pq_state,
irq->resend, irq->masked_pending);
}
@@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
val |= prio << KVM_XICS_PRIORITY_SHIFT;
if (irqp->lsi) {
val |= KVM_XICS_LEVEL_SENSITIVE;
- if (irqp->asserted)
+ if (irqp->pq_state & PQ_PRESENTED)
val |= KVM_XICS_PENDING;
} else if (irqp->masked_pending || irqp->resend)
val |= KVM_XICS_PENDING;
+
+ if (irqp->pq_state & PQ_PRESENTED)
+ val |= KVM_XICS_PRESENTED;
+
+ if (irqp->pq_state & PQ_QUEUED)
+ val |= KVM_XICS_QUEUED;
+
ret = 0;
}
arch_spin_unlock(&ics->lock);
@@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
irqp->resend = 0;
irqp->masked_pending = 0;
irqp->lsi = 0;
- irqp->asserted = 0;
- if (val & KVM_XICS_LEVEL_SENSITIVE) {
+ irqp->pq_state = 0;
+ if (val & KVM_XICS_LEVEL_SENSITIVE)
irqp->lsi = 1;
- if (val & KVM_XICS_PENDING)
- irqp->asserted = 1;
- }
+ /* If PENDING, set P in case P is not saved because of old code */
+ if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+ irqp->pq_state |= PQ_PRESENTED;
+ if (val & KVM_XICS_QUEUED)
+ irqp->pq_state |= PQ_QUEUED;
irqp->exists = 1;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
if (val & KVM_XICS_PENDING)
- icp_deliver_irq(xics, NULL, irqp->number);
+ icp_deliver_irq(xics, NULL, irqp->number, false);
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 2a50320b55ca..ec5474cf70c6 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -31,16 +31,19 @@
/* Priority value to use for disabling an interrupt */
#define MASKED 0xff
+#define PQ_PRESENTED 1
+#define PQ_QUEUED 2
+
/* State for one irq source */
struct ics_irq_state {
u32 number;
u32 server;
+ u32 pq_state;
u8 priority;
u8 saved_priority;
u8 resend;
u8 masked_pending;
u8 lsi; /* level-sensitive interrupt */
- u8 asserted; /* Only for LSI */
u8 exists;
int intr_cpu;
u32 host_irq;
@@ -73,7 +76,6 @@ struct kvmppc_icp {
*/
#define XICS_RM_KICK_VCPU 0x1
#define XICS_RM_CHECK_RESEND 0x2
-#define XICS_RM_REJECT 0x4
#define XICS_RM_NOTIFY_EOI 0x8
u32 rm_action;
struct kvm_vcpu *rm_kick_target;
@@ -84,7 +86,6 @@ struct kvmppc_icp {
/* Counters for each reason we exited real mode */
unsigned long n_rm_kick_vcpu;
unsigned long n_rm_check_resend;
- unsigned long n_rm_reject;
unsigned long n_rm_notify_eoi;
/* Counters for handling ICP processing in real mode */
unsigned long n_check_resend;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd892dec7cb6..2b38d824e9e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ONE_REG:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -565,6 +566,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PPC_HWRNG:
r = kvmppc_hwrng_present();
break;
+ case KVM_CAP_PPC_MMU_RADIX:
+ r = !!(hv_enabled && radix_enabled());
+ break;
+ case KVM_CAP_PPC_MMU_HASH_V3:
+ r = !!(hv_enabled && !radix_enabled() &&
+ cpu_has_feature(CPU_FTR_ARCH_300));
+ break;
#endif
case KVM_CAP_SYNC_MMU:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -605,6 +613,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPAPR_MULTITCE:
r = 1;
break;
+ case KVM_CAP_SPAPR_RESIZE_HPT:
+ /* Disable this on POWER9 until code handles new HPTE format */
+ r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
+ break;
#endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
@@ -1107,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
#endif
}
- r = kvmppc_vcpu_run(run, vcpu);
+ if (run->immediate_exit)
+ r = -EINTR;
+ else
+ r = kvmppc_vcpu_run(run, vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -1468,6 +1483,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
break;
}
+ case KVM_PPC_CONFIGURE_V3_MMU: {
+ struct kvm *kvm = filp->private_data;
+ struct kvm_ppc_mmuv3_cfg cfg;
+
+ r = -EINVAL;
+ if (!kvm->arch.kvm_ops->configure_mmu)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&cfg, argp, sizeof(cfg)))
+ goto out;
+ r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg);
+ break;
+ }
+ case KVM_PPC_GET_RMMU_INFO: {
+ struct kvm *kvm = filp->private_data;
+ struct kvm_ppc_rmmu_info info;
+
+ r = -EINVAL;
+ if (!kvm->arch.kvm_ops->get_rmmu_info)
+ goto out;
+ r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info);
+ if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+ r = -EFAULT;
+ break;
+ }
default: {
struct kvm *kvm = filp->private_data;
r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);