Diffstat (limited to 'virt/kvm/guest_memfd.c')
 -rw-r--r--  virt/kvm/guest_memfd.c | 544
 1 file changed, 424 insertions(+), 120 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 47a9f68f7b24..fdaea3422c30 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,18 +1,47 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
+#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
+#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
-#include <linux/anon_inodes.h>
#include "kvm_mm.h"
-struct kvm_gmem {
+static struct vfsmount *kvm_gmem_mnt;
+
+/*
+ * A guest_memfd instance can be associated with multiple VMs, each with its
+ * own "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */
+struct gmem_file {
struct kvm *kvm;
struct xarray bindings;
struct list_head entry;
};
+struct gmem_inode {
+ struct shared_policy policy;
+ struct inode vfs_inode;
+
+ u64 flags;
+};
+
+static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
+{
+ return container_of(inode, struct gmem_inode, vfs_inode);
+}
+
+#define kvm_gmem_for_each_file(f, mapping) \
+ list_for_each_entry(f, &(mapping)->i_private_list, entry)
+
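As a hedged illustration of the split described in the comment above (hypothetical helper, not part of the patch): code reaches a VM's private state through file->private_data, the shared per-storage state through GMEM_I(file_inode(file)), and the other sharing VMs through the inode's i_private_list.

/*
 * Hypothetical helper, for illustration only (not in the patch). Assumes
 * the caller holds filemap_invalidate_lock(inode->i_mapping), which is
 * what protects i_private_list.
 */
static void example_gmem_views(struct file *file)
{
	struct inode *inode = file_inode(file);		/* shared storage */
	struct gmem_file *f = file->private_data;	/* this VM's view */
	struct gmem_file *other;

	/* Per-storage properties live in the inode... */
	pr_debug("guest_memfd flags: %llx\n",
		 (unsigned long long)GMEM_I(inode)->flags);

	/* ...while each sharing VM has its own gmem_file on i_private_list. */
	kvm_gmem_for_each_file(other, inode->i_mapping)
		pr_debug("bound to kvm %p%s\n", other->kvm,
			 other == f ? " (this file)" : "");
}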
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -25,6 +54,11 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
+static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return gfn - slot->base_gfn + slot->gmem.pgoff;
+}
+
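For a quick sanity check of the arithmetic, the inverse mapping (guest_memfd page index back to gfn) would be the hypothetical helper below; it is not part of the patch.

/* Hypothetical inverse of kvm_gmem_get_index(), for illustration only. */
static gfn_t example_gmem_get_gfn(struct kvm_memory_slot *slot, pgoff_t index)
{
	return slot->base_gfn + (index - slot->gmem.pgoff);
}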
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -77,9 +111,9 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
* The order will be passed when creating the guest_memfd, and
* checked when creating memslots.
*/
- WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
- index = gfn - slot->base_gfn + slot->gmem.pgoff;
- index = ALIGN_DOWN(index, 1 << folio_order(folio));
+ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
+ index = kvm_gmem_get_index(slot, gfn);
+ index = ALIGN_DOWN(index, folio_nr_pages(folio));
r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
if (!r)
kvm_gmem_mark_prepared(folio);
@@ -99,18 +133,45 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+	 * Fast path: check if the folio is already present in the mapping to
+	 * avoid the mempolicy lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
-static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
- pgoff_t end)
+static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
+{
+ if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
+ return KVM_FILTER_SHARED;
+
+ return KVM_FILTER_PRIVATE;
+}
+
+static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
+ pgoff_t end,
+ enum kvm_gfn_range_filter attr_filter)
{
bool flush = false, found_memslot = false;
struct kvm_memory_slot *slot;
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
unsigned long index;
- xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
pgoff_t pgoff = slot->gmem.pgoff;
struct kvm_gfn_range gfn_range = {
@@ -118,6 +179,7 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
.slot = slot,
.may_block = true,
+ .attr_filter = attr_filter,
};
if (!found_memslot) {
@@ -137,24 +199,43 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
KVM_MMU_UNLOCK(kvm);
}
-static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
- pgoff_t end)
+static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
+ pgoff_t end)
+{
+ enum kvm_gfn_range_filter attr_filter;
+ struct gmem_file *f;
+
+ attr_filter = kvm_gmem_get_invalidate_filter(inode);
+
+ kvm_gmem_for_each_file(f, inode->i_mapping)
+ __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
+}
+
+static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
+ pgoff_t end)
{
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
- if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
KVM_MMU_LOCK(kvm);
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
}
}
+static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
+ pgoff_t end)
+{
+ struct gmem_file *f;
+
+ kvm_gmem_for_each_file(f, inode->i_mapping)
+ __kvm_gmem_invalidate_end(f, start, end);
+}
+
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
- struct list_head *gmem_list = &inode->i_mapping->i_private_list;
pgoff_t start = offset >> PAGE_SHIFT;
pgoff_t end = (offset + len) >> PAGE_SHIFT;
- struct kvm_gmem *gmem;
/*
* Bindings must be stable across invalidation to ensure the start+end
@@ -162,13 +243,11 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
*/
filemap_invalidate_lock(inode->i_mapping);
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_begin(gmem, start, end);
+ kvm_gmem_invalidate_begin(inode, start, end);
truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_end(gmem, start, end);
+ kvm_gmem_invalidate_end(inode, start, end);
filemap_invalidate_unlock(inode->i_mapping);
@@ -248,9 +327,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
- struct kvm_gmem *gmem = file->private_data;
+ struct gmem_file *f = file->private_data;
struct kvm_memory_slot *slot;
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
unsigned long index;
/*
@@ -259,32 +338,37 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
* dereferencing the slot for existing bindings needs to be protected
* against memslot updates, specifically so that unbind doesn't race
* and free the memslot (kvm_gmem_get_file() will return NULL).
+ *
+ * Since .release is called only when the reference count is zero,
+ * after which file_ref_get() and get_file_active() fail,
+ * kvm_gmem_get_pfn() cannot be using the file concurrently.
+ * file_ref_put() provides a full barrier, and get_file_active() the
+ * matching acquire barrier.
*/
mutex_lock(&kvm->slots_lock);
filemap_invalidate_lock(inode->i_mapping);
- xa_for_each(&gmem->bindings, index, slot)
- rcu_assign_pointer(slot->gmem.file, NULL);
-
- synchronize_rcu();
+ xa_for_each(&f->bindings, index, slot)
+ WRITE_ONCE(slot->gmem.file, NULL);
/*
* All in-flight operations are gone and new bindings can be created.
* Zap all SPTEs pointed at by this file. Do not free the backing
* memory, as its lifetime is associated with the inode, not the file.
*/
- kvm_gmem_invalidate_begin(gmem, 0, -1ul);
- kvm_gmem_invalidate_end(gmem, 0, -1ul);
+ __kvm_gmem_invalidate_begin(f, 0, -1ul,
+ kvm_gmem_get_invalidate_filter(inode));
+ __kvm_gmem_invalidate_end(f, 0, -1ul);
- list_del(&gmem->entry);
+ list_del(&f->entry);
filemap_invalidate_unlock(inode->i_mapping);
mutex_unlock(&kvm->slots_lock);
- xa_destroy(&gmem->bindings);
- kfree(gmem);
+ xa_destroy(&f->bindings);
+ kfree(f);
kvm_put_kvm(kvm);
@@ -296,28 +380,118 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
/*
* Do not return slot->gmem.file if it has already been closed;
* there might be some time between the last fput() and when
- * kvm_gmem_release() clears slot->gmem.file, and you do not
- * want to spin in the meanwhile.
+ * kvm_gmem_release() clears slot->gmem.file.
*/
return get_file_active(&slot->gmem.file);
}
-static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
+DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
+ kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
+
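DEFINE_CLASS() comes from <linux/cleanup.h>: declaring CLASS(gmem_get_file, file)(slot) pins the file via kvm_gmem_get_file() and drops the reference automatically when the variable leaves scope, which is why the converted callers below no longer need an explicit fput() on every exit path. A minimal sketch with a hypothetical caller:

/* Hypothetical caller, for illustration only (not in the patch). */
static bool example_slot_file_is_live(struct kvm_memory_slot *slot)
{
	CLASS(gmem_get_file, file)(slot);	/* kvm_gmem_get_file(slot) */

	if (!file)
		return false;

	return true;	/* fput(file) runs automatically on return */
}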
+static bool kvm_gmem_supports_mmap(struct inode *inode)
{
- return gfn - slot->base_gfn + slot->gmem.pgoff;
+ return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
+}
+
+static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct folio *folio;
+ vm_fault_t ret = VM_FAULT_LOCKED;
+
+ if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
+
+ if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
+ return VM_FAULT_SIGBUS;
+
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -EAGAIN)
+ return VM_FAULT_RETRY;
+
+ return vmf_error(PTR_ERR(folio));
+ }
+
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_folio;
+ }
+
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ kvm_gmem_mark_prepared(folio);
+ }
+
+ vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+ if (ret != VM_FAULT_LOCKED) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Return the memory policy for this index, or NULL if none is set.
+ *
+ * Returning NULL, e.g. instead of the current task's memory policy, is
+ * important for the .get_policy kernel ABI: it indicates that no
+ * explicit policy has been set via mbind() for this memory. The caller
+ * can then replace NULL with the default memory policy instead of the
+ * current task's memory policy.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
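With .get_policy/.set_policy wired into the VMA, userspace can steer guest_memfd allocations with the standard NUMA syscalls; the policy lands in the inode's shared_policy and is honored by kvm_gmem_get_folio(). A hedged userspace sketch, assuming the region has already been mapped (see the mmap sketch further below):

/* Userspace sketch, illustration only: bind a mapped guest_memfd range
 * to NUMA node 0 via mbind(), which reaches kvm_gmem_set_policy(). */
#include <stddef.h>
#include <numaif.h>

static int bind_gmem_to_node0(void *mem, size_t size)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only */

	return mbind(mem, size, MPOL_BIND, &nodemask,
		     8 * sizeof(nodemask), 0);
}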
+static const struct vm_operations_struct kvm_gmem_vm_ops = {
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
+};
+
+static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!kvm_gmem_supports_mmap(file_inode(file)))
+ return -ENODEV;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+ (VM_SHARED | VM_MAYSHARE)) {
+ return -EINVAL;
+ }
+
+ vma->vm_ops = &kvm_gmem_vm_ops;
+
+ return 0;
}
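For context, a hedged userspace sketch of the intended flow: create a guest_memfd with the new flags and map it MAP_SHARED; the first touch then goes through kvm_gmem_fault_user_mapping(). The GUEST_MEMFD_FLAG_MMAP and GUEST_MEMFD_FLAG_INIT_SHARED names come from this series, and their uapi plumbing is assumed here.

/* Userspace sketch, illustration only; minimal error handling. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int create_and_map_gmem(int vm_fd, size_t size, void **mem)
{
	struct kvm_create_guest_memfd args = {
		.size = size,
		.flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

	if (gmem_fd < 0)
		return -1;

	/* Must be MAP_SHARED: kvm_gmem_mmap() rejects private mappings. */
	*mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
	if (*mem == MAP_FAILED)
		return -1;

	memset(*mem, 0, size);	/* faults in zeroed, "prepared" pages */
	return gmem_fd;
}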
static struct file_operations kvm_gmem_fops = {
+ .mmap = kvm_gmem_mmap,
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
};
-void kvm_gmem_init(struct module *module)
-{
- kvm_gmem_fops.owner = module;
-}
-
static int kvm_gmem_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src,
enum migrate_mode mode)
@@ -328,8 +502,6 @@ static int kvm_gmem_migrate_folio(struct address_space *mapping,
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
- struct list_head *gmem_list = &mapping->i_private_list;
- struct kvm_gmem *gmem;
pgoff_t start, end;
filemap_invalidate_lock_shared(mapping);
@@ -337,8 +509,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
start = folio->index;
end = start + folio_nr_pages(folio);
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_begin(gmem, start, end);
+ kvm_gmem_invalidate_begin(mapping->host, start, end);
/*
* Do not truncate the range, what action is taken in response to the
@@ -349,8 +520,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
* error to userspace.
*/
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_end(gmem, start, end);
+ kvm_gmem_invalidate_end(mapping->host, start, end);
filemap_invalidate_unlock_shared(mapping);
@@ -377,30 +547,24 @@ static const struct address_space_operations kvm_gmem_aops = {
#endif
};
-static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask,
- unsigned int query_flags)
-{
- struct inode *inode = path->dentry->d_inode;
-
- generic_fillattr(idmap, request_mask, inode, stat);
- return 0;
-}
-
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
- .getattr = kvm_gmem_getattr,
.setattr = kvm_gmem_setattr,
};
+bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
+{
+ return true;
+}
+
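The __weak definition above lets an architecture veto shared-by-default guest_memfd. A hypothetical strong override (illustration only, not in the patch) would simply return false for configurations where guest memory must remain private:

/* Hypothetical arch override, illustration only (not in the patch). */
bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	/* An arch whose guest memory must always stay private would refuse
	 * GUEST_MEMFD_FLAG_INIT_SHARED outright. */
	return false;
}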
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
- const char *anon_name = "[kvm-gmem]";
- struct kvm_gmem *gmem;
+ static const char *name = "[kvm-gmem]";
+ struct gmem_file *f;
struct inode *inode;
struct file *file;
int fd, err;
@@ -409,25 +573,24 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
if (fd < 0)
return fd;
- gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
- if (!gmem) {
+ f = kzalloc(sizeof(*f), GFP_KERNEL);
+ if (!f) {
err = -ENOMEM;
goto err_fd;
}
- file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
- O_RDWR, NULL);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
+ /* __fput() will take care of fops_put(). */
+ if (!fops_get(&kvm_gmem_fops)) {
+ err = -ENOENT;
goto err_gmem;
}
- file->f_flags |= O_LARGEFILE;
-
- inode = file->f_inode;
- WARN_ON(file->f_mapping != inode->i_mapping);
+ inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto err_fops;
+ }
- inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
inode->i_mapping->a_ops = &kvm_gmem_aops;
inode->i_mode |= S_IFREG;
@@ -437,16 +600,31 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+ GMEM_I(inode)->flags = flags;
+
+ file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_inode;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+ file->private_data = f;
+
kvm_get_kvm(kvm);
- gmem->kvm = kvm;
- xa_init(&gmem->bindings);
- list_add(&gmem->entry, &inode->i_mapping->i_private_list);
+ f->kvm = kvm;
+ xa_init(&f->bindings);
+ list_add(&f->entry, &inode->i_mapping->i_private_list);
fd_install(fd, file);
return fd;
+err_inode:
+ iput(inode);
+err_fops:
+ fops_put(&kvm_gmem_fops);
err_gmem:
- kfree(gmem);
+ kfree(f);
err_fd:
put_unused_fd(fd);
return err;
@@ -456,9 +634,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
loff_t size = args->size;
u64 flags = args->flags;
- u64 valid_flags = 0;
- if (flags & ~valid_flags)
+ if (flags & ~kvm_gmem_get_supported_flags(kvm))
return -EINVAL;
if (size <= 0 || !PAGE_ALIGNED(size))
@@ -472,7 +649,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
{
loff_t size = slot->npages << PAGE_SHIFT;
unsigned long start, end;
- struct kvm_gmem *gmem;
+ struct gmem_file *f;
struct inode *inode;
struct file *file;
int r = -EINVAL;
@@ -486,8 +663,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
if (file->f_op != &kvm_gmem_fops)
goto err;
- gmem = file->private_data;
- if (gmem->kvm != kvm)
+ f = file->private_data;
+ if (f->kvm != kvm)
goto err;
inode = file_inode(file);
@@ -501,21 +678,23 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
start = offset >> PAGE_SHIFT;
end = start + slot->npages;
- if (!xa_empty(&gmem->bindings) &&
- xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ if (!xa_empty(&f->bindings) &&
+ xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
filemap_invalidate_unlock(inode->i_mapping);
goto err;
}
/*
- * No synchronize_rcu() needed, any in-flight readers are guaranteed to
- * be see either a NULL file or this new file, no need for them to go
- * away.
+	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
+	 * created, so kvm_gmem_bind() must be called on a new memslot.
+	 * Because the memslot is not yet visible, kvm_gmem_get_pfn() is
+	 * guaranteed to see the file.
*/
- rcu_assign_pointer(slot->gmem.file, file);
+ WRITE_ONCE(slot->gmem.file, file);
slot->gmem.pgoff = start;
+ if (kvm_gmem_supports_mmap(inode))
+ slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
- xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+ xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping);
/*
@@ -529,30 +708,49 @@ err:
return r;
}
-void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
unsigned long start = slot->gmem.pgoff;
unsigned long end = start + slot->npages;
- struct kvm_gmem *gmem;
- struct file *file;
+
+ xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
/*
- * Nothing to do if the underlying file was already closed (or is being
- * closed right now), kvm_gmem_release() invalidates all bindings.
+	 * synchronize_srcu(&kvm->srcu) has already ensured that
+	 * kvm_gmem_get_pfn() cannot see this memslot.
*/
- file = kvm_gmem_get_file(slot);
- if (!file)
+ WRITE_ONCE(slot->gmem.file, NULL);
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ /*
+ * Nothing to do if the underlying file was _already_ closed, as
+ * kvm_gmem_release() invalidates and nullifies all bindings.
+ */
+ if (!slot->gmem.file)
return;
- gmem = file->private_data;
+ CLASS(gmem_get_file, file)(slot);
+
+ /*
+ * However, if the file is _being_ closed, then the bindings need to be
+ * removed as kvm_gmem_release() might not run until after the memslot
+ * is freed. Note, modifying the bindings is safe even though the file
+ * is dying as kvm_gmem_release() nullifies slot->gmem.file under
+ * slots_lock, and only puts its reference to KVM after destroying all
+ * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
+ * yet destroyed the bindings or freed the gmem_file, and can't do so
+ * until the caller drops slots_lock.
+ */
+ if (!file) {
+ __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
+ return;
+ }
filemap_invalidate_lock(file->f_mapping);
- xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
- rcu_assign_pointer(slot->gmem.file, NULL);
- synchronize_rcu();
+ __kvm_gmem_unbind(slot, file->private_data);
filemap_invalidate_unlock(file->f_mapping);
-
- fput(file);
}
/* Returns a locked folio on success. */
@@ -561,17 +759,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
pgoff_t index, kvm_pfn_t *pfn,
bool *is_prepared, int *max_order)
{
- struct kvm_gmem *gmem = file->private_data;
+ struct file *slot_file = READ_ONCE(slot->gmem.file);
+ struct gmem_file *f = file->private_data;
struct folio *folio;
- if (file != slot->gmem.file) {
- WARN_ON_ONCE(slot->gmem.file);
+ if (file != slot_file) {
+ WARN_ON_ONCE(slot_file);
return ERR_PTR(-EFAULT);
}
- gmem = file->private_data;
- if (xa_load(&gmem->bindings, index) != slot) {
- WARN_ON_ONCE(xa_load(&gmem->bindings, index));
+ if (xa_load(&f->bindings, index) != slot) {
+ WARN_ON_ONCE(xa_load(&f->bindings, index));
return ERR_PTR(-EIO);
}
@@ -598,19 +796,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
int *max_order)
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
- struct file *file = kvm_gmem_get_file(slot);
struct folio *folio;
bool is_prepared = false;
int r = 0;
+ CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
- if (IS_ERR(folio)) {
- r = PTR_ERR(folio);
- goto out;
- }
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (!is_prepared)
r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
@@ -622,17 +818,14 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
-out:
- fput(file);
return r;
}
-EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
-#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
- struct file *file;
struct kvm_memory_slot *slot;
void __user *p;
@@ -640,14 +833,15 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
long i;
lockdep_assert_held(&kvm->slots_lock);
- if (npages < 0)
+
+ if (WARN_ON_ONCE(npages <= 0))
return -EINVAL;
slot = gfn_to_memslot(kvm, start_gfn);
- if (!kvm_slot_can_be_private(slot))
+ if (!kvm_slot_has_gmem(slot))
return -EINVAL;
- file = kvm_gmem_get_file(slot);
+ CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
@@ -705,8 +899,118 @@ put_folio_and_exit:
filemap_invalidate_unlock(file->f_mapping);
- fput(file);
return ret && !i ? ret : i;
}
-EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif
+
+static struct kmem_cache *kvm_gmem_inode_cachep;
+
+static void kvm_gmem_init_inode_once(void *__gi)
+{
+ struct gmem_inode *gi = __gi;
+
+ /*
+ * Note! Don't initialize the inode with anything specific to the
+ * guest_memfd instance, or that might be specific to how the inode is
+ * used (from the VFS-layer's perspective). This hook is called only
+ * during the initial slab allocation, i.e. only fields/state that are
+ * idempotent across _all_ use of the inode _object_ can be initialized
+ * at this time!
+ */
+ inode_init_once(&gi->vfs_inode);
+}
+
+static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
+{
+ struct gmem_inode *gi;
+
+ gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
+ if (!gi)
+ return NULL;
+
+ mpol_shared_policy_init(&gi->policy, NULL);
+
+ gi->flags = 0;
+ return &gi->vfs_inode;
+}
+
+static void kvm_gmem_destroy_inode(struct inode *inode)
+{
+ mpol_free_shared_policy(&GMEM_I(inode)->policy);
+}
+
+static void kvm_gmem_free_inode(struct inode *inode)
+{
+ kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
+}
+
+static const struct super_operations kvm_gmem_super_operations = {
+ .statfs = simple_statfs,
+ .alloc_inode = kvm_gmem_alloc_inode,
+ .destroy_inode = kvm_gmem_destroy_inode,
+ .free_inode = kvm_gmem_free_inode,
+};
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ ctx = fc->fs_private;
+ ctx->ops = &kvm_gmem_super_operations;
+
+ return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+ .name = "guest_memfd",
+ .init_fs_context = kvm_gmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int kvm_gmem_init_mount(void)
+{
+ kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+
+ if (IS_ERR(kvm_gmem_mnt))
+ return PTR_ERR(kvm_gmem_mnt);
+
+ kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+ return 0;
+}
+
+int kvm_gmem_init(struct module *module)
+{
+ struct kmem_cache_args args = {
+ .align = 0,
+ .ctor = kvm_gmem_init_inode_once,
+ };
+ int ret;
+
+ kvm_gmem_fops.owner = module;
+ kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
+ sizeof(struct gmem_inode),
+ &args, SLAB_ACCOUNT);
+ if (!kvm_gmem_inode_cachep)
+ return -ENOMEM;
+
+ ret = kvm_gmem_init_mount();
+ if (ret) {
+ kmem_cache_destroy(kvm_gmem_inode_cachep);
+ return ret;
+ }
+ return 0;
+}
+
+void kvm_gmem_exit(void)
+{
+ kern_unmount(kvm_gmem_mnt);
+ kvm_gmem_mnt = NULL;
+ rcu_barrier();
+ kmem_cache_destroy(kvm_gmem_inode_cachep);
+}