summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOliver Upton <oliver.upton@linux.dev>2024-03-07 00:56:01 +0000
committerOliver Upton <oliver.upton@linux.dev>2024-03-07 00:56:01 +0000
commit9bd8d7df1971c2ebdcaf4526cf7a3f4ea38d0ede (patch)
tree1a773f15a68befda40169b1a8432742909d2b3f3
parent8dbc41105e96641e9c1569f512d19f0046a02463 (diff)
parenta39d3a966a090989b89c0287a67cd98c85ae2f52 (diff)
Merge branch kvm-arm64/vfio-normal-nc into kvmarm/next
* kvm-arm64/vfio-normal-nc: : Normal-NC support for vfio-pci @ stage-2, courtesy of Ankit Agrawal : : KVM's policy to date has been that any and all MMIO mapping at stage-2 : is treated as Device-nGnRE. This is primarily done due to concerns of : the guest triggering uncontainable failures in the system if they manage : to tickle the device / memory system the wrong way, though this is : unnecessarily restrictive for devices that can be reasoned as 'safe'. : : Unsurprisingly, the Device-* mapping can really hurt the performance of : assigned devices that can handle Gathering, and can be an outright : correctness issue if the guest driver does unaligned accesses. : : Rather than opening the floodgates to the full ecosystem of devices that : can be exposed to VMs, take the conservative approach and allow PCI : devices to be mapped as Normal-NC since it has been determined to be : 'safe'. vfio: Convey kvm that the vfio-pci device is wc safe KVM: arm64: Set io memory s2 pte as normalnc for vfio pci device mm: Introduce new flag to indicate wc safe KVM: arm64: Introduce new flag for non-cacheable IO memory Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h2
-rw-r--r--arch/arm64/include/asm/memory.h2
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c24
-rw-r--r--arch/arm64/kvm/mmu.c14
-rw-r--r--drivers/vfio/pci/vfio_pci_core.c19
-rw-r--r--include/linux/mm.h14
6 files changed, 65 insertions, 10 deletions
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index cfdf40f734b1..19278dfe7978 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -197,6 +197,7 @@ enum kvm_pgtable_stage2_flags {
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes.
* @KVM_PGTABLE_PROT_SW0: Software bit 0.
* @KVM_PGTABLE_PROT_SW1: Software bit 1.
* @KVM_PGTABLE_PROT_SW2: Software bit 2.
@@ -208,6 +209,7 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
+ KVM_PGTABLE_PROT_NORMAL_NC = BIT(4),
KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index d82305ab420f..449ca2ff1df6 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -173,6 +173,7 @@
* Memory types for Stage-2 translation
*/
#define MT_S2_NORMAL 0xf
+#define MT_S2_NORMAL_NC 0x5
#define MT_S2_DEVICE_nGnRE 0x1
/*
@@ -180,6 +181,7 @@
* Stage-2 enforces Normal-WB and Device-nGnRE
*/
#define MT_S2_FWB_NORMAL 6
+#define MT_S2_FWB_NORMAL_NC 5
#define MT_S2_FWB_DEVICE_nGnRE 1
#ifdef CONFIG_ARM64_4K_PAGES
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c651df904fe3..e2982a8922c3 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -717,15 +717,29 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
kvm_pte_t *ptep)
{
- bool device = prot & KVM_PGTABLE_PROT_DEVICE;
- kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
- KVM_S2_MEMATTR(pgt, NORMAL);
+ kvm_pte_t attr;
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ switch (prot & (KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_NORMAL_NC)) {
+ case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
+ return -EINVAL;
+ case KVM_PGTABLE_PROT_DEVICE:
+ if (prot & KVM_PGTABLE_PROT_X)
+ return -EINVAL;
+ attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
+ break;
+ case KVM_PGTABLE_PROT_NORMAL_NC:
+ if (prot & KVM_PGTABLE_PROT_X)
+ return -EINVAL;
+ attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
+ break;
+ default:
+ attr = KVM_S2_MEMATTR(pgt, NORMAL);
+ }
+
if (!(prot & KVM_PGTABLE_PROT_X))
attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
- else if (device)
- return -EINVAL;
if (prot & KVM_PGTABLE_PROT_R)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d14504821b79..1742fdccb432 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1381,7 +1381,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
int ret = 0;
bool write_fault, writable, force_pte = false;
bool exec_fault, mte_allowed;
- bool device = false;
+ bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1472,6 +1472,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
+ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+
/* Don't use the VMA after the unlock -- it may have vanished */
vma = NULL;
@@ -1557,10 +1559,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
- else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+ if (device) {
+ if (vfio_allow_any_uc)
+ prot |= KVM_PGTABLE_PROT_NORMAL_NC;
+ else
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
+ }
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1cbc990d42e0..df6f99bdf70d 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1862,8 +1862,25 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
/*
* See remap_pfn_range(), called from vfio_pci_fault() but we can't
* change vm_flags within the fault handler. Set them now.
+ *
+ * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
+ * allowing KVM stage 2 device mapping attributes to use Normal-NC
+ * rather than DEVICE_nGnRE, which allows guest mappings
+ * supporting write-combining attributes (WC). ARM does not
+ * architecturally guarantee this is safe, and indeed some MMIO
+ * regions like the GICv2 VCPU interface can trigger uncontained
+ * faults if Normal-NC is used.
+ *
+ * To safely use VFIO in KVM the platform must guarantee full
+ * safety in the guest where no action taken against a MMIO
+ * mapping can trigger an uncontained failure. The assumption is
+ * that most VFIO PCI platforms support this for both mapping types,
+ * at least in common flows, based on some expectations of how
+ * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
+ * the VMA flags.
*/
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &vfio_pci_mmap_ops;
return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..59576e56c58b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -391,6 +391,20 @@ extern unsigned int kobjsize(const void *objp);
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+/*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED_BIT 39
+#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#else
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#endif
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)