summaryrefslogtreecommitdiff
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c5650
1 files changed, 4200 insertions, 1450 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13a54953a273..ecbac900c35f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,36 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
+#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
+#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/memcontrol.h>
#include <linux/llist.h>
-#include <asm/uaccess.h>
+#include <linux/uio.h>
+#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/overflow.h>
+#include <linux/pgtable.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include <linux/page_owner.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
+#include "internal.h"
+#include "pgalloc-track.h"
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_max_page_shift = PAGE_SHIFT;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
+bool is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
+
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
struct vfree_deferred {
struct llist_head list;
@@ -38,78 +90,438 @@ struct vfree_deferred {
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
-static void __vunmap(const void *, int);
+/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+ struct page *page;
+ unsigned long size = PAGE_SIZE;
+
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
+ return -EINVAL;
-static void free_work(struct work_struct *w)
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ if (unlikely(!pte_none(ptep_get(pte)))) {
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ dump_page(page, "remapping already mapped page");
+ }
+ BUG();
+ }
+
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+ if (size != PAGE_SIZE) {
+ pte_t entry = pfn_pte(pfn, prot);
+
+ entry = arch_make_huge_pte(entry, ilog2(size), 0);
+ set_huge_pte_at(&init_mm, addr, pte, entry, size);
+ pfn += PFN_DOWN(size);
+ continue;
+ }
+#endif
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
- struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *llnode = llist_del_all(&p->list);
- while (llnode) {
- void *p = llnode;
- llnode = llist_next(llnode);
- __vunmap(p, 1);
- }
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
}
-/*** Page table manipulation functions ***/
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err = 0;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return err;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err = 0;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return err;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int err = 0;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return err;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int vmap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
+ ioremap_max_page_shift);
+ flush_cache_vmap(addr, end);
+ if (!err)
+ err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+ ioremap_max_page_shift);
+ return err;
+}
+
+int ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ struct vm_struct *area;
+
+ area = find_vm_area((void *)addr);
+ if (!area || !(area->flags & VM_IOREMAP)) {
+ WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
+ return -EINVAL;
+ }
+ if (addr != (unsigned long)area->addr ||
+ (void *)end != area->addr + get_vm_area_size(area)) {
+ WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
+ addr, end, (long)area->addr,
+ (long)area->addr + get_vm_area_size(area));
+ return -ERANGE;
+ }
+ return vmap_page_range(addr, end, phys_addr, prot);
+}
+
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+ *mask |= PGTBL_PTE_MODIFIED;
}
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
+ int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
+ continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
- vunmap_pte_range(pmd, addr, next);
+ vunmap_pte_range(pmd, addr, next, mask);
+
+ cond_resched();
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
+ int cleared;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
+ continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
- vunmap_pmd_range(pud, addr, next);
+ vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
- pgd_t *pgd;
+ p4d_t *p4d;
unsigned long next;
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ p4d_clear_huge(p4d);
+ if (p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ vunmap_pud_range(p4d, addr, next, mask);
+ } while (p4d++, addr = next, addr != end);
+}
+
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
+ *
+ * The caller is responsible for calling flush_cache_vmap() before calling
+ * this function, and flush_tlb_kernel_range after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
+
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_pud_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+}
+
+void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ kmsan_vunmap_range_noflush(start, end);
+ __vunmap_range_noflush(start, end);
+}
+
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -117,91 +529,245 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
* callers keep track of where we're up to.
*/
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(*pte)))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
+
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- return 0;
+
+ arch_leave_lazy_mmu_mode();
+ *mask |= PGTBL_PTE_MODIFIED;
+
+ return err;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, pgd, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
- *
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
- */
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ return -ENOMEM;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
+ unsigned long start = addr;
pgd_t *pgd;
unsigned long next;
- unsigned long addr = start;
int err = 0;
int nr = 0;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
- return nr;
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
}
-static int vmap_page_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
- ret = vmap_page_range_noflush(start, end, prot, pages);
- flush_cache_vmap(start, end);
- return ret;
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ page_to_phys(pages[i]), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
+{
+ int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
+ page_shift, gfp_mask);
+
+ if (ret)
+ return ret;
+ return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+}
+
+static int __vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask);
+ flush_cache_vmap(addr, end);
+ return err;
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
+}
+
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ might_sleep();
+ if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+ return -EINVAL;
+ if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+ return -EINVAL;
+ if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+ return -EINVAL;
+ if ((end - start) >> PAGE_SHIFT > totalram_pages())
+ return -E2BIG;
+ if (start < (unsigned long)area->addr ||
+ (void *)end > area->addr + get_vm_area_size(area))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end, struct page **pages)
+{
+ int err;
+
+ err = check_sparse_vm_area(area, start, end);
+ if (err)
+ return err;
+
+ return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ if (check_sparse_vm_area(area, start, end))
+ return;
+
+ vunmap_range(start, end);
}
int is_vmalloc_or_module_addr(const void *x)
@@ -211,22 +777,29 @@ int is_vmalloc_or_module_addr(const void *x)
* and fall back on vmalloc() if that fails. Others
* just put it in the vmalloc space.
*/
-#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
- unsigned long addr = (unsigned long)x;
+#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
return is_vmalloc_addr(x);
}
+EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
/*
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -234,21 +807,42 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
*/
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
- if (!pgd_none(*pgd)) {
- pud_t *pud = pud_offset(pgd, addr);
- if (!pud_none(*pud)) {
- pmd_t *pmd = pmd_offset(pud, addr);
- if (!pmd_none(*pmd)) {
- pte_t *ptep, pte;
-
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
- if (pte_present(pte))
- page = pte_page(pte);
- pte_unmap(ptep);
- }
- }
- }
+ if (pgd_none(*pgd))
+ return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return NULL;
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
+ return NULL;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
+ return NULL;
+
+ ptep = pte_offset_kernel(pmd, addr);
+ pte = ptep_get(ptep);
+ if (pte_present(pte))
+ page = pte_page(pte);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
@@ -265,26 +859,223 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x01
-#define VM_LAZY_FREEING 0x02
-#define VM_VM_AREA 0x04
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
+
+static DEFINE_SPINLOCK(free_vmap_area_lock);
+static bool vmap_initialized __read_mostly;
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augment red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
+
+/*
+ * This structure defines a single, solid model where a list and
+ * rb-tree are part of one entity protected by the lock. Nodes are
+ * sorted in ascending order, thus for O(1) access to left/right
+ * neighbors a list is used as well as for sequential traversal.
+ */
+struct rb_list {
+ struct rb_root root;
+ struct list_head head;
+ spinlock_t lock;
+};
+
+/*
+ * A fast size storage contains VAs up to 1M size. A pool consists
+ * of linked between each other ready to go VAs of certain sizes.
+ * An index in the pool-array corresponds to number of pages + 1.
+ */
+#define MAX_VA_SIZE_PAGES 256
+
+struct vmap_pool {
+ struct list_head head;
+ unsigned long len;
+};
+
+/*
+ * An effective vmap-node logic. Users make use of nodes instead
+ * of a global heap. It allows to balance an access and mitigate
+ * contention.
+ */
+static struct vmap_node {
+ /* Simple size segregated storage. */
+ struct vmap_pool pool[MAX_VA_SIZE_PAGES];
+ spinlock_t pool_lock;
+ bool skip_populate;
+
+ /* Bookkeeping data of this node. */
+ struct rb_list busy;
+ struct rb_list lazy;
+
+ /*
+ * Ready-to-free areas.
+ */
+ struct list_head purge_list;
+ struct work_struct purge_work;
+ unsigned long nr_purged;
+} single;
+
+/*
+ * Initial setup consists of one single node, i.e. a balancing
+ * is fully disabled. Later on, after vmap is initialized these
+ * parameters are updated based on a system capacity.
+ */
+static struct vmap_node *vmap_nodes = &single;
+static __read_mostly unsigned int nr_vmap_nodes = 1;
+static __read_mostly unsigned int vmap_zone_size = 1;
+
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
+static inline unsigned int
+addr_to_node_id(unsigned long addr)
+{
+ return (addr / vmap_zone_size) % nr_vmap_nodes;
+}
+
+static inline struct vmap_node *
+addr_to_node(unsigned long addr)
+{
+ return &vmap_nodes[addr_to_node_id(addr)];
+}
+
+static inline struct vmap_node *
+id_to_node(unsigned int id)
+{
+ return &vmap_nodes[id % nr_vmap_nodes];
+}
+
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+ return 0;
+}
-static DEFINE_SPINLOCK(vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
-static struct rb_root vmap_area_root = RB_ROOT;
+/*
+ * We use the value 0 to represent "no node", that is why
+ * an encoded value will be the node-id incremented by 1.
+ * It is always greater then 0. A valid node_id which can
+ * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
+ * is not valid 0 is returned.
+ */
+static unsigned int
+encode_vn_id(unsigned int node_id)
+{
+ /* Can store U8_MAX [0:254] nodes. */
+ if (node_id < nr_vmap_nodes)
+ return (node_id + 1) << BITS_PER_BYTE;
-/* The vmap cache globals are protected by vmap_area_lock */
-static struct rb_node *free_vmap_cache;
-static unsigned long cached_hole_size;
-static unsigned long cached_vstart;
-static unsigned long cached_align;
+ /* Warn and no node encoded. */
+ WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
+ return 0;
+}
-static unsigned long vmap_area_pcpu_hole;
+/*
+ * Returns an encoded node-id, the valid range is within
+ * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
+ * returned if extracted data is wrong.
+ */
+static unsigned int
+decode_vn_id(unsigned int val)
+{
+ unsigned int node_id = (val >> BITS_PER_BYTE) - 1;
+
+ /* Can store U8_MAX [0:254] nodes. */
+ if (node_id < nr_vmap_nodes)
+ return node_id;
+
+ /* If it was _not_ zero, warn. */
+ WARN_ONCE(node_id != UINT_MAX,
+ "Decode wrong node id (%d)\n", node_id);
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+ return nr_vmap_nodes;
+}
+
+static bool
+is_vn_id_valid(unsigned int node_id)
{
- struct rb_node *n = vmap_area_root.rb_node;
+ if (node_id < nr_vmap_nodes)
+ return true;
+
+ return false;
+}
+
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
+
+static void reclaim_and_purge_vmap_areas(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static void drain_vmap_area_work(struct work_struct *work);
+static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
+
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
+
+unsigned long vmalloc_nr_pages(void)
+{
+ return atomic_long_read(&nr_vmalloc_pages);
+}
+
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
while (n) {
struct vmap_area *va;
@@ -301,200 +1092,775 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
return NULL;
}
-static void __insert_vmap_area(struct vmap_area *va)
+/* Look up the first VA which satisfies addr < va_end, NULL if none. */
+static struct vmap_area *
+__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+ struct vmap_area *va = NULL;
+ struct rb_node *n = root->rb_node;
- while (*p) {
- struct vmap_area *tmp_va;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
- else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
- else
- BUG();
+ while (n) {
+ struct vmap_area *tmp;
+
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_end > addr) {
+ va = tmp;
+ if (tmp->va_start <= addr)
+ break;
+
+ n = n->rb_left;
+ } else
+ n = n->rb_right;
}
- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+ return va;
+}
- /* address-sort this list */
- tmp = rb_prev(&va->rb_node);
- if (tmp) {
- struct vmap_area *prev;
- prev = rb_entry(tmp, struct vmap_area, rb_node);
- list_add_rcu(&va->list, &prev->list);
- } else
- list_add_rcu(&va->list, &vmap_area_list);
+/*
+ * Returns a node where a first VA, that satisfies addr < va_end, resides.
+ * If success, a node is locked. A user is responsible to unlock it when a
+ * VA is no longer needed to be accessed.
+ *
+ * Returns NULL if nothing found.
+ */
+static struct vmap_node *
+find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
+{
+ unsigned long va_start_lowest;
+ struct vmap_node *vn;
+
+repeat:
+ va_start_lowest = 0;
+
+ for_each_vmap_node(vn) {
+ spin_lock(&vn->busy.lock);
+ *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
+
+ if (*va)
+ if (!va_start_lowest || (*va)->va_start < va_start_lowest)
+ va_start_lowest = (*va)->va_start;
+ spin_unlock(&vn->busy.lock);
+ }
+
+ /*
+ * Check if found VA exists, it might have gone away. In this case we
+ * repeat the search because a VA has been removed concurrently and we
+ * need to proceed to the next one, which is a rare case.
+ */
+ if (va_start_lowest) {
+ vn = addr_to_node(va_start_lowest);
+
+ spin_lock(&vn->busy.lock);
+ *va = __find_vmap_area(va_start_lowest, &vn->busy.root);
+
+ if (*va)
+ return vn;
+
+ spin_unlock(&vn->busy.lock);
+ goto repeat;
+ }
+
+ return NULL;
+}
+
+/*
+ * This function returns back addresses of parent node
+ * and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
+{
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
+
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
+ }
+
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with parent rb_node and correct direction, i name
+ * it link, where the new va->rb_node will be attached to.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
+
+ /*
+ * During the traversal we also do some sanity check.
+ * Trigger the BUG() if there are sides(left/right)
+ * or full overlaps.
+ */
+ if (va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
+}
+
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
+
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+__link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head, bool augment)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
+ }
+
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (augment) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * It is because we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let __augment_tree_propagate_from() puts everything to
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
+
+ /* Address-sort this list */
+ list_add(&va->list, head);
}
-static void purge_vmap_area_lazy(void);
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, false);
+}
+
+static __always_inline void
+link_va_augment(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, true);
+}
+
+static __always_inline void
+__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
+{
+ if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
+ return;
+
+ if (augment)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
+
+ list_del_init(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+}
+
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, false);
+}
+
+static __always_inline void
+unlink_va_augment(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, true);
+}
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
- * Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * Gets called when remove the node and rotate.
*/
-static struct vmap_area *alloc_vmap_area(unsigned long size,
- unsigned long align,
- unsigned long vstart, unsigned long vend,
- int node, gfp_t gfp_mask)
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
+static void
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *n;
- unsigned long addr;
- int purged = 0;
- struct vmap_area *first;
+ unsigned long computed_size;
- BUG_ON(!size);
- BUG_ON(size & ~PAGE_MASK);
- BUG_ON(!is_power_of_2(align));
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+}
+#endif
- va = kmalloc_node(sizeof(struct vmap_area),
- gfp_mask & GFP_RECLAIM_MASK, node);
- if (unlikely(!va))
- return ERR_PTR(-ENOMEM);
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check();
+#endif
+}
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ if (link)
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ if (link) {
+ link_va_augment(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. If coalesce is not done a new
+ * free area is inserted. If VA has been merged, it is
+ * freed.
+ *
+ * Please note, it can return NULL in case of overlap
+ * ranges, followed by WARN() report. Despite it is a
+ * buggy behaviour, a system can be alive and keep
+ * ongoing.
+ */
+static __always_inline struct vmap_area *
+__merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head, bool augment)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
-retry:
- spin_lock(&vmap_area_lock);
/*
- * Invalidate cache if we have more permissive parameters.
- * cached_hole_size notes the largest hole noticed _below_
- * the vmap_area cached in free_vmap_cache: if size fits
- * into that hole, we want to scan from vstart to reuse
- * the hole instead of allocating above free_vmap_cache.
- * Note that __free_vmap_area may update free_vmap_cache
- * without updating cached_hole_size or cached_align.
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
*/
- if (!free_vmap_cache ||
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
-nocache:
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
- /* record if we encounter less permissive parameters */
- cached_vstart = vstart;
- cached_align = align;
-
- /* find starting point for our search */
- if (free_vmap_cache) {
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size < addr)
- goto overflow;
+ link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size < addr)
- goto overflow;
+ /*
+ * Get next node of VA to check if merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
- n = vmap_area_root.rb_node;
- first = NULL;
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
- while (n) {
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
- }
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
- if (!first)
- goto found;
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
}
- /* from the starting point, walk areas until a suitable hole is found */
- while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
- goto overflow;
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
+ if (merged)
+ __unlink_va(va, root, augment);
+
+ sibling->va_end = va->va_end;
- if (list_is_last(&first->list, &vmap_area_list))
- goto found;
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
- first = list_entry(first->list.next,
- struct vmap_area, list);
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
}
-found:
- if (addr + size > vend)
- goto overflow;
+insert:
+ if (!merged)
+ __link_va(va, root, parent, link, head, augment);
- va->va_start = addr;
- va->va_end = addr + size;
- va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
- spin_unlock(&vmap_area_lock);
+ return va;
+}
- BUG_ON(va->va_start & (align-1));
- BUG_ON(va->va_start < vstart);
- BUG_ON(va->va_end > vend);
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ return __merge_or_add_vmap_area(va, root, head, false);
+}
+
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area_augment(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ va = __merge_or_add_vmap_area(va, root, head, true);
+ if (va)
+ augment_tree_propagate_from(va);
return va;
+}
-overflow:
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = 1;
- goto retry;
- }
- if (printk_ratelimit())
- printk(KERN_WARNING
- "vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
- kfree(va);
- return ERR_PTR(-EBUSY);
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can be overflowed due to big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
}
-static void __free_vmap_area(struct vmap_area *va)
+/*
+ * Find the first free block(lowest start address) in the tree,
+ * that will accomplish the request corresponding to passing
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE,
+ * a search length is adjusted to account for worst case alignment
+ * overhead.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(struct rb_root *root, unsigned long size,
+ unsigned long align, unsigned long vstart, bool adjust_search_size)
{
- BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = root->rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = adjust_search_size ? size + align - 1 : size;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
- if (free_vmap_cache) {
- if (va->va_end < cached_vstart) {
- free_vmap_cache = NULL;
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
} else {
- struct vmap_area *cache;
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- if (va->va_start <= cache->va_start) {
- free_vmap_cache = rb_prev(&va->rb_node);
- /*
- * We don't try to update cached_hole_size or
- * cached_align, but it won't go very wrong.
- */
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * Does not make sense to go deeper towards the right
+ * sub-tree if it does not have a free block that is
+ * equal or bigger to the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the first right sub-tree,
+ * that will satisfy the search criteria. It can happen
+ * due to "vstart" restriction or an alignment overhead
+ * that is bigger then PAGE_SIZE.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ /*
+ * Shift the vstart forward. Please note, we update it with
+ * parent's start address adding "1" because we do not want
+ * to enter same sub-tree after it has already been checked
+ * and no suitable free block found there.
+ */
+ vstart = va->va_start + 1;
+ node = node->rb_right;
+ break;
+ }
}
}
}
- rb_erase(&va->rb_node, &vmap_area_root);
- RB_CLEAR_NODE(&va->rb_node);
- list_del_rcu(&va->list);
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, head, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
+
+static __always_inline int
+va_clip(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size)
+{
+ struct vmap_area *lva = NULL;
+ enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va_augment(va, root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
+ if (unlikely(!lva)) {
+ /*
+ * For percpu allocator we do not do any pre-allocation
+ * and leave it as it is. The reason is it most likely
+ * never ends up with NE_FIT_TYPE splitting. In case of
+ * percpu allocations offsets and sizes are aligned to
+ * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
+ * are its main fitting cases.
+ *
+ * There are a few exceptions though, as an example it is
+ * a first allocation (early boot up) when we have "one"
+ * big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() why. If so, then
+ * GFP_NOWAIT is used instead to get an extra object for
+ * split purpose. That is rare and most time does not
+ * occur.
+ *
+ * What happens if an allocation gets failed. Basically,
+ * an "overflow" path is triggered to purge lazily freed
+ * areas to free some memory, then, the "retry" path is
+ * triggered to repeat one more time. See more details
+ * in alloc_vmap_area() function.
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!lva)
+ return -ENOMEM;
+ }
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -EINVAL;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (lva) /* type == NE_FIT_TYPE */
+ insert_vmap_area_augment(lva, &va->rb_node, root, head);
+ }
+
+ return 0;
+}
+
+static unsigned long
+va_alloc(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend)
+{
+ unsigned long nva_start_addr;
+ int ret;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return -ERANGE;
+
+ /* Update the free vmap_area. */
+ ret = va_clip(root, head, va, nva_start_addr, size);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ return nva_start_addr;
+}
+
+/*
+ * Returns a start address of the newly allocated area, if success.
+ * Otherwise an error value is returned that indicates failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend)
+{
+ bool adjust_search_size = true;
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
/*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
+ * Do not adjust when:
+ * a) align <= PAGE_SIZE, because it does not make any sense.
+ * All blocks(their start addresses) are at least PAGE_SIZE
+ * aligned anyway;
+ * b) a short range where a requested size corresponds to exactly
+ * specified [vstart:vend] interval and an alignment > PAGE_SIZE.
+ * With adjusted search length an allocation would not succeed.
*/
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
+ adjust_search_size = false;
+
+ va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
+ if (unlikely(!va))
+ return -ENOENT;
- kfree_rcu(va, rcu_head);
+ nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
+#endif
+
+ return nva_start_addr;
}
/*
@@ -502,39 +1868,302 @@ static void __free_vmap_area(struct vmap_area *va)
*/
static void free_vmap_area(struct vmap_area *va)
{
- spin_lock(&vmap_area_lock);
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
+ /*
+ * Remove from the busy tree/list.
+ */
+ spin_lock(&vn->busy.lock);
+ unlink_va(va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
+}
+
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL, *tmp;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. It guarantees that
+ * a CPU that does an allocation is preloaded.
+ *
+ * We do it in non-atomic context, thus it allows us to use more
+ * permissive allocation masks to be more stable under low memory
+ * condition and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ tmp = NULL;
+ if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
+ kmem_cache_free(vmap_area_cachep, va);
}
-/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
+static struct vmap_pool *
+size_to_va_pool(struct vmap_node *vn, unsigned long size)
+{
+ unsigned int idx = (size - 1) / PAGE_SIZE;
+
+ if (idx < MAX_VA_SIZE_PAGES)
+ return &vn->pool[idx];
+
+ return NULL;
+}
+
+static bool
+node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
+{
+ struct vmap_pool *vp;
+
+ vp = size_to_va_pool(n, va_size(va));
+ if (!vp)
+ return false;
+
+ spin_lock(&n->pool_lock);
+ list_add(&va->list, &vp->head);
+ WRITE_ONCE(vp->len, vp->len + 1);
+ spin_unlock(&n->pool_lock);
+
+ return true;
+}
+
+static struct vmap_area *
+node_pool_del_va(struct vmap_node *vn, unsigned long size,
+ unsigned long align, unsigned long vstart,
+ unsigned long vend)
+{
+ struct vmap_area *va = NULL;
+ struct vmap_pool *vp;
+ int err = 0;
+
+ vp = size_to_va_pool(vn, size);
+ if (!vp || list_empty(&vp->head))
+ return NULL;
+
+ spin_lock(&vn->pool_lock);
+ if (!list_empty(&vp->head)) {
+ va = list_first_entry(&vp->head, struct vmap_area, list);
+
+ if (IS_ALIGNED(va->va_start, align)) {
+ /*
+ * Do some sanity check and emit a warning
+ * if one of below checks detects an error.
+ */
+ err |= (va_size(va) != size);
+ err |= (va->va_start < vstart);
+ err |= (va->va_end > vend);
+
+ if (!WARN_ON_ONCE(err)) {
+ list_del_init(&va->list);
+ WRITE_ONCE(vp->len, vp->len - 1);
+ } else {
+ va = NULL;
+ }
+ } else {
+ list_move_tail(&va->list, &vp->head);
+ va = NULL;
+ }
+ }
+ spin_unlock(&vn->pool_lock);
+
+ return va;
+}
+
+static struct vmap_area *
+node_alloc(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ unsigned long *addr, unsigned int *vn_id)
{
- vunmap_page_range(va->va_start, va->va_end);
+ struct vmap_area *va;
+
+ *vn_id = 0;
+ *addr = -EINVAL;
+
+ /*
+ * Fallback to a global heap if not vmalloc or there
+ * is only one node.
+ */
+ if (vstart != VMALLOC_START || vend != VMALLOC_END ||
+ nr_vmap_nodes == 1)
+ return NULL;
+
+ *vn_id = raw_smp_processor_id() % nr_vmap_nodes;
+ va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
+ *vn_id = encode_vn_id(*vn_id);
+
+ if (va)
+ *addr = va->va_start;
+
+ return va;
}
-static void vmap_debug_free_range(unsigned long start, unsigned long end)
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = vm->requested_size = va_size(va);
+ vm->caller = caller;
+ va->vm = vm;
+}
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend. If vm is passed in, the two will also be bound.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+ unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ int node, gfp_t gfp_mask,
+ unsigned long va_flags, struct vm_struct *vm)
+{
+ struct vmap_node *vn;
+ struct vmap_area *va;
+ unsigned long freed;
+ unsigned long addr;
+ unsigned int vn_id;
+ bool allow_block;
+ int purged = 0;
+ int ret;
+
+ if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
+ allow_block = gfpflags_allow_blocking(gfp_mask);
+ might_sleep_if(allow_block);
+
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * If a VA is obtained from a global heap(if it fails here)
+ * it is anyway marked with this "vn_id" so it is returned
+ * to this pool's node later. Such way gives a possibility
+ * to populate pools based on users demand.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
- *
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * On success a ready to go VA is returned.
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
+ if (!va) {
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+ if (unlikely(!va))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+ }
+
+retry:
+ if (IS_ERR_VALUE(addr)) {
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+ size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
+ /*
+ * This is not a fast path. Check if yielding is needed. This
+ * is the only reschedule point in the vmalloc() path.
+ */
+ if (allow_block)
+ cond_resched();
+ }
+
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
+
+ /*
+ * If an allocation fails, the error value is
+ * returned. Therefore trigger the overflow path.
+ */
+ if (IS_ERR_VALUE(addr)) {
+ if (allow_block)
+ goto overflow;
+
+ /*
+ * We can not trigger any reclaim logic because
+ * sleeping is not allowed, thus fail an allocation.
+ */
+ goto out_free_va;
+ }
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->vm = NULL;
+ va->flags = (va_flags | vn_id);
+
+ if (vm) {
+ vm->addr = (void *)va->va_start;
+ vm->size = va_size(va);
+ va->vm = vm;
+ }
+
+ vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ spin_unlock(&vn->busy.lock);
+
+ BUG_ON(!IS_ALIGNED(va->va_start, align));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
+
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
+ if (ret) {
+ free_vmap_area(va);
+ return ERR_PTR(ret);
+ }
+
+ return va;
+
+overflow:
+ if (!purged) {
+ reclaim_and_purge_vmap_areas();
+ purged = 1;
+ goto retry;
+ }
+
+ freed = 0;
+ blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+
+ if (freed > 0) {
+ purged = 0;
+ goto retry;
+ }
+
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
+ pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
+ size, vstart, vend);
+
+out_free_va:
+ kmem_cache_free(vmap_area_cachep, va);
+ return ERR_PTR(-EBUSY);
+}
+
+int register_vmap_purge_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&vmap_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
+
+int unregister_vmap_purge_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
+EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
@@ -561,125 +2190,275 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual critical section protected
+ * by this lock, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-/*
- * called before a call to iounmap() if the caller wants vm_area_struct's
- * immediately freed.
- */
-void set_iounmap_nonlazy(void)
+static void
+reclaim_list_global(struct list_head *head)
{
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+ struct vmap_area *va, *n;
+
+ if (list_empty(head))
+ return;
+
+ spin_lock(&free_vmap_area_lock);
+ list_for_each_entry_safe(va, n, head, list)
+ merge_or_add_vmap_area_augment(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
}
-/*
- * Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
- */
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static void
+decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
- static DEFINE_SPINLOCK(purge_lock);
- LIST_HEAD(valist);
- struct vmap_area *va;
- struct vmap_area *n_va;
- int nr = 0;
+ LIST_HEAD(decay_list);
+ struct rb_root decay_root = RB_ROOT;
+ struct vmap_area *va, *nva;
+ unsigned long n_decay, pool_len;
+ int i;
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+ LIST_HEAD(tmp_list);
+
+ if (list_empty(&vn->pool[i].head))
+ continue;
- if (sync)
- purge_fragmented_blocks_allcpus();
+ /* Detach the pool, so no-one can access it. */
+ spin_lock(&vn->pool_lock);
+ list_replace_init(&vn->pool[i].head, &tmp_list);
+ spin_unlock(&vn->pool_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(va, &vmap_area_list, list) {
- if (va->flags & VM_LAZY_FREE) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- list_add_tail(&va->purge_list, &valist);
- va->flags |= VM_LAZY_FREEING;
- va->flags &= ~VM_LAZY_FREE;
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
+
+ /* Decay a pool by ~25% out of left objects. */
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
+
+ list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
+ list_del_init(&va->list);
+ merge_or_add_vmap_area(va, &decay_root, &decay_list);
+ }
+
+ /*
+ * Attach the pool back if it has been partly decayed.
+ * Please note, it is supposed that nobody(other contexts)
+ * can populate the pool therefore a simple list replace
+ * operation takes place here.
+ */
+ if (!list_empty(&tmp_list)) {
+ spin_lock(&vn->pool_lock);
+ list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
+ spin_unlock(&vn->pool_lock);
}
}
- rcu_read_unlock();
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ reclaim_list_global(&decay_list);
+}
+
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+ struct vmap_area *va;
+ unsigned long start, end;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+ end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
- if (nr) {
- spin_lock(&vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ list_for_each_entry(va, &vn->purge_list, list) {
+ if (is_vmalloc_or_module_addr((void *) va->va_start))
+ kasan_release_vmalloc(va->va_start, va->va_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE);
}
- spin_unlock(&purge_lock);
+
+ kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
}
-/*
- * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
- * is already purging.
- */
-static void try_purge_vmap_area_lazy(void)
+static void purge_vmap_node(struct work_struct *work)
{
- unsigned long start = ULONG_MAX, end = 0;
+ struct vmap_node *vn = container_of(work,
+ struct vmap_node, purge_work);
+ unsigned long nr_purged_pages = 0;
+ struct vmap_area *va, *n_va;
+ LIST_HEAD(local_list);
+
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_release_vmalloc_node(vn);
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ vn->nr_purged = 0;
+
+ list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
+ unsigned long nr = va_size(va) >> PAGE_SHIFT;
+ unsigned int vn_id = decode_vn_id(va->flags);
+
+ list_del_init(&va->list);
+
+ nr_purged_pages += nr;
+ vn->nr_purged++;
+
+ if (is_vn_id_valid(vn_id) && !vn->skip_populate)
+ if (node_pool_add_va(vn, va))
+ continue;
+
+ /* Go back to global. */
+ list_add(&va->list, &local_list);
+ }
+
+ atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);
+
+ reclaim_list_global(&local_list);
}
/*
- * Kick off a purge of the outstanding lazy areas.
+ * Purges all lazily-freed vmap areas.
*/
-static void purge_vmap_area_lazy(void)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
+ bool full_pool_decay)
{
- unsigned long start = ULONG_MAX, end = 0;
+ unsigned long nr_purged_areas = 0;
+ unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
+ unsigned int nr_purge_nodes;
+ struct vmap_node *vn;
+ int i;
+
+ lockdep_assert_held(&vmap_purge_lock);
+
+ /*
+ * Use cpumask to mark which node has to be processed.
+ */
+ purge_nodes = CPU_MASK_NONE;
+
+ for_each_vmap_node(vn) {
+ INIT_LIST_HEAD(&vn->purge_list);
+ vn->skip_populate = full_pool_decay;
+ decay_va_pool_node(vn, full_pool_decay);
+
+ if (RB_EMPTY_ROOT(&vn->lazy.root))
+ continue;
+
+ spin_lock(&vn->lazy.lock);
+ WRITE_ONCE(vn->lazy.root.rb_node, NULL);
+ list_replace_init(&vn->lazy.head, &vn->purge_list);
+ spin_unlock(&vn->lazy.lock);
+
+ start = min(start, list_first_entry(&vn->purge_list,
+ struct vmap_area, list)->va_start);
+
+ end = max(end, list_last_entry(&vn->purge_list,
+ struct vmap_area, list)->va_end);
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
+ }
+
+ nr_purge_nodes = cpumask_weight(&purge_nodes);
+ if (nr_purge_nodes > 0) {
+ flush_tlb_kernel_range(start, end);
+
+ /* One extra worker is per a lazy_max_pages() full set minus one. */
+ nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
+ nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
+
+ for_each_cpu(i, &purge_nodes) {
+ vn = &vmap_nodes[i];
+
+ if (nr_purge_helpers > 0) {
+ INIT_WORK(&vn->purge_work, purge_vmap_node);
+
+ if (cpumask_test_cpu(i, cpu_online_mask))
+ schedule_work_on(i, &vn->purge_work);
+ else
+ schedule_work(&vn->purge_work);
+
+ nr_purge_helpers--;
+ } else {
+ vn->purge_work.func = NULL;
+ purge_vmap_node(&vn->purge_work);
+ nr_purged_areas += vn->nr_purged;
+ }
+ }
+
+ for_each_cpu(i, &purge_nodes) {
+ vn = &vmap_nodes[i];
+
+ if (vn->purge_work.func) {
+ flush_work(&vn->purge_work);
+ nr_purged_areas += vn->nr_purged;
+ }
+ }
+ }
+
+ trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
+ return nr_purged_areas > 0;
}
/*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
*/
-static void free_vmap_area_noflush(struct vmap_area *va)
+static void reclaim_and_purge_vmap_areas(void)
+
+{
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0, true);
+ mutex_unlock(&vmap_purge_lock);
+}
+
+static void drain_vmap_area_work(struct work_struct *work)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ mutex_lock(&vmap_purge_lock);
+ __purge_vmap_area_lazy(ULONG_MAX, 0, false);
+ mutex_unlock(&vmap_purge_lock);
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped,
+ * unlinked and flush_cache_vunmap had been called for the correct
+ * range previously.
*/
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
+ unsigned long nr_lazy_max = lazy_max_pages();
+ unsigned long va_start = va->va_start;
+ unsigned int vn_id = decode_vn_id(va->flags);
+ struct vmap_node *vn;
+ unsigned long nr_lazy;
+
+ if (WARN_ON_ONCE(!list_empty(&va->list)))
+ return;
+
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+
+ /*
+ * If it was request by a certain node we would like to
+ * return it to that node, i.e. its pool for later reuse.
+ */
+ vn = is_vn_id_valid(vn_id) ?
+ id_to_node(vn_id):addr_to_node(va->va_start);
+
+ spin_lock(&vn->lazy.lock);
+ insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
+ spin_unlock(&vn->lazy.lock);
+
+ trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
+
+ /* After this point, we may free va at any time */
+ if (unlikely(nr_lazy > nr_lazy_max))
+ schedule_work(&drain_vmap_work);
}
/*
@@ -688,29 +2467,75 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ vunmap_range_noflush(va->va_start, va->va_end);
+ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+
+ free_vmap_area_noflush(va);
}
-static struct vmap_area *find_vmap_area(unsigned long addr)
+struct vmap_area *find_vmap_area(unsigned long addr)
{
+ struct vmap_node *vn;
struct vmap_area *va;
+ int i, j;
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr);
- spin_unlock(&vmap_area_lock);
+ if (unlikely(!vmap_initialized))
+ return NULL;
- return va;
+ /*
+ * An addr_to_node_id(addr) converts an address to a node index
+ * where a VA is located. If VA spans several zones and passed
+ * addr is not the same as va->va_start, what is not common, we
+ * may need to scan extra nodes. See an example:
+ *
+ * <----va---->
+ * -|-----|-----|-----|-----|-
+ * 1 2 0 1
+ *
+ * VA resides in node 1 whereas it spans 1, 2 an 0. If passed
+ * addr is within 2 or 0 nodes we should do extra work.
+ */
+ i = j = addr_to_node_id(addr);
+ do {
+ vn = &vmap_nodes[i];
+
+ spin_lock(&vn->busy.lock);
+ va = __find_vmap_area(addr, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
+ if (va)
+ return va;
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
+
+ return NULL;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
+static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
+ struct vmap_node *vn;
struct vmap_area *va;
+ int i, j;
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
+ /*
+ * Check the comment in the find_vmap_area() about the loop.
+ */
+ i = j = addr_to_node_id(addr);
+ do {
+ vn = &vmap_nodes[i];
+ spin_lock(&vn->busy.lock);
+ va = __find_vmap_area(addr, &vn->busy.root);
+ if (va)
+ unlink_va(va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
+ if (va)
+ return va;
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
+
+ return NULL;
+}
/*** Per cpu kva allocator ***/
@@ -742,34 +2567,94 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
-static bool vmap_initialized __read_mostly = false;
+/*
+ * Purge threshold to prevent overeager purging of fragmented blocks for
+ * regular operations: Purge if vb->free is less than 1/4 of the capacity.
+ */
+#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
+
+#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/
+#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/
+#define VMAP_FLAGS_MASK 0x3
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
+
+ /*
+ * An xarray requires an extra memory dynamically to
+ * be allocated. If it is an issue, we can use rb-tree
+ * instead.
+ */
+ struct xarray vmap_blocks;
};
struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
- struct vmap_block_queue *vbq;
unsigned long free, dirty;
- DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+ DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
+ unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
+ unsigned int cpu;
};
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
- * in the free path. Could get rid of this if we change the API to return a
- * "cookie" from alloc, to be passed to free. But no big deal yet.
+ * In order to fast access to any "vmap_block" associated with a
+ * specific address, we use a hash.
+ *
+ * A per-cpu vmap_block_queue is used in both ways, to serialize
+ * an access to free block chains among CPUs(alloc path) and it
+ * also acts as a vmap_block hash(alloc/free paths). It means we
+ * overload it, since we already have the per-cpu array which is
+ * used as a hash table. When used as a hash a 'cpu' passed to
+ * per_cpu() is not actually a CPU but rather a hash index.
+ *
+ * A hash function is addr_to_vb_xa() which hashes any address
+ * to a specific index(in a hash) it belongs to. This then uses a
+ * per_cpu() macro to access an array with generated index.
+ *
+ * An example:
+ *
+ * CPU_1 CPU_2 CPU_0
+ * | | |
+ * V V V
+ * 0 10 20 30 40 50 60
+ * |------|------|------|------|------|------|...<vmap address space>
+ * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
+ *
+ * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
+ * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
+ * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
+ * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
+ *
+ * This technique almost always avoids lock contention on insert/remove,
+ * however xarray spinlocks protect against any contention that remains.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static struct xarray *
+addr_to_vb_xa(unsigned long addr)
+{
+ int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;
+
+ /*
+ * Please note, nr_cpu_ids points on a highest set
+ * possible bit, i.e. we never invoke cpumask_next()
+ * if an index points on it which is nr_cpu_ids - 1.
+ */
+ if (!cpu_possible(index))
+ index = cpumask_next(index, cpu_possible_mask);
+
+ return &per_cpu(vmap_block_queue, index).vmap_blocks;
+}
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -785,107 +2670,161 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
return addr;
}
-static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+ unsigned long addr;
+
+ addr = va_start + (pages_off << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+ return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block. Of course pages number can't exceed VMAP_BBMAP_BITS
+ * @order: how many 2^order pages should be occupied in newly allocated block
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
+ */
+static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
+ struct xarray *xa;
unsigned long vb_idx;
int node, err;
+ void *vaddr;
node = numa_node_id();
- vb = kmalloc_node(sizeof(struct vmap_block),
- gfp_mask & GFP_RECLAIM_MASK, node);
+ vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node);
if (unlikely(!vb))
return ERR_PTR(-ENOMEM);
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
- node, gfp_mask);
+ node, gfp_mask,
+ VMAP_RAM|VMAP_BLOCK, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
+ vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
- vb->free = VMAP_BBMAP_BITS;
+ /* At least something should be left free */
+ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
+ vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
- bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+ bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
+ vb->cpu = raw_smp_processor_id();
+ xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
-
- vbq = &get_cpu_var(vmap_block_queue);
- vb->vbq = vbq;
+ err = xa_insert(xa, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
+ /*
+ * list_add_tail_rcu could happened in another core
+ * rather than vb->cpu due to task migration, which
+ * is safe as list_add_tail_rcu will ensure the list's
+ * integrity together with list_for_each_rcu from read
+ * side.
+ */
+ vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
spin_lock(&vbq->lock);
- list_add_rcu(&vb->free_list, &vbq->free);
+ list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
- return vb;
+ return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
{
+ struct vmap_node *vn;
struct vmap_block *tmp;
- unsigned long vb_idx;
+ struct xarray *xa;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ xa = addr_to_vb_xa(vb->va->va_start);
+ tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
+ vn = addr_to_node(vb->va->va_start);
+ spin_lock(&vn->busy.lock);
+ unlink_va(vb->va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
+static bool purge_fragmented_block(struct vmap_block *vb,
+ struct list_head *purge_list, bool force_purge)
+{
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);
+
+ if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
+ vb->dirty == VMAP_BBMAP_BITS)
+ return false;
+
+ /* Don't overeagerly purge usable blocks unless requested */
+ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
+ return false;
+
+ /* prevent further allocs after releasing lock */
+ WRITE_ONCE(vb->free, 0);
+ /* prevent purging it again */
+ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ list_add_tail(&vb->purge, purge_list);
+ return true;
+}
+
+static void free_purged_blocks(struct list_head *purge_list)
+{
+ struct vmap_block *vb, *n_vb;
+
+ list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
static void purge_fragmented_blocks(int cpu)
{
LIST_HEAD(purge);
struct vmap_block *vb;
- struct vmap_block *n_vb;
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ unsigned long free = READ_ONCE(vb->free);
+ unsigned long dirty = READ_ONCE(vb->dirty);
- if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ if (free + dirty != VMAP_BBMAP_BITS ||
+ dirty == VMAP_BBMAP_BITS)
continue;
spin_lock(&vb->lock);
- if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
- vb->free = 0; /* prevent further allocs after releasing lock */
- vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
- spin_lock(&vbq->lock);
- list_del_rcu(&vb->free_list);
- spin_unlock(&vbq->lock);
- spin_unlock(&vb->lock);
- list_add_tail(&vb->purge, &purge);
- } else
- spin_unlock(&vb->lock);
+ purge_fragmented_block(vb, &purge, true);
+ spin_unlock(&vb->lock);
}
rcu_read_unlock();
-
- list_for_each_entry_safe(vb, n_vb, &purge, purge) {
- list_del(&vb->purge);
- free_vmap_block(vb);
- }
+ free_purged_blocks(&purge);
}
static void purge_fragmented_blocks_allcpus(void)
@@ -900,10 +2839,10 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
- unsigned long addr = 0;
+ void *vaddr = NULL;
unsigned int order;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
if (WARN_ON(size == 0)) {
/*
@@ -911,77 +2850,81 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
* get_order(0) returns funny result. Just warn and terminate
* early.
*/
- return NULL;
+ return ERR_PTR(-EINVAL);
}
order = get_order(size);
-again:
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ vbq = raw_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i;
+ unsigned long pages_off;
+
+ if (READ_ONCE(vb->free) < (1UL << order))
+ continue;
spin_lock(&vb->lock);
- if (vb->free < 1UL << order)
- goto next;
-
- i = VMAP_BBMAP_BITS - vb->free;
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
- vb->free -= 1UL << order;
+ if (vb->free < (1UL << order)) {
+ spin_unlock(&vb->lock);
+ continue;
+ }
+
+ pages_off = VMAP_BBMAP_BITS - vb->free;
+ vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
+ WRITE_ONCE(vb->free, vb->free - (1UL << order));
+ bitmap_set(vb->used_map, pages_off, (1UL << order));
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
+
spin_unlock(&vb->lock);
break;
-next:
- spin_unlock(&vb->lock);
}
- put_cpu_var(vmap_block_queue);
rcu_read_unlock();
- if (!addr) {
- vb = new_vmap_block(gfp_mask);
- if (IS_ERR(vb))
- return vb;
- goto again;
- }
+ /* Allocate new block if nothing was found */
+ if (!vaddr)
+ vaddr = new_vmap_block(order, gfp_mask);
- return (void *)addr;
+ return vaddr;
}
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
+ struct xarray *xa;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+ flush_cache_vunmap(addr, addr + size);
order = get_order(size);
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+ xa = addr_to_vb_xa(addr);
+ vb = xa_load(xa, addr_to_vb_idx(addr));
- vb_idx = addr_to_vb_idx((unsigned long)addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
+ spin_lock(&vb->lock);
+ bitmap_clear(vb->used_map, offset, (1UL << order));
+ spin_unlock(&vb->lock);
+
+ vunmap_range_noflush(addr, addr + size);
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
- BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
- vb->dirty += 1UL << order;
+ /* Expand the not yet TLB flushed dirty range */
+ vb->dirty_min = min(vb->dirty_min, offset);
+ vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
+
+ WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
@@ -990,63 +2933,74 @@ static void vb_free(const void *addr, unsigned long size)
spin_unlock(&vb->lock);
}
-/**
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
- *
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
- * to amortize TLB flushing overheads. What this means is that any page you
- * have now, may, in a former life, have been mapped into kernel virtual
- * address by the vmap layer and so there might be some CPUs with TLB entries
- * still referencing that page (additional to the regular 1:1 kernel mapping).
- *
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
- * be sure that none of the pages we have control over will have any aliases
- * from the vmap layer.
- */
-void vm_unmap_aliases(void)
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
- unsigned long start = ULONG_MAX, end = 0;
+ LIST_HEAD(purge_list);
int cpu;
- int flush = 0;
if (unlikely(!vmap_initialized))
return;
+ mutex_lock(&vmap_purge_lock);
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
+ unsigned long idx;
rcu_read_lock();
- list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i;
-
+ xa_for_each(&vbq->vmap_blocks, idx, vb) {
spin_lock(&vb->lock);
- i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
- while (i < VMAP_BBMAP_BITS) {
+
+ /*
+ * Try to purge a fragmented block first. If it's
+ * not purgeable, check whether there is dirty
+ * space to be flushed.
+ */
+ if (!purge_fragmented_block(vb, &purge_list, false) &&
+ vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
+ unsigned long va_start = vb->va->va_start;
unsigned long s, e;
- int j;
- j = find_next_zero_bit(vb->dirty_map,
- VMAP_BBMAP_BITS, i);
- s = vb->va->va_start + (i << PAGE_SHIFT);
- e = vb->va->va_start + (j << PAGE_SHIFT);
- flush = 1;
+ s = va_start + (vb->dirty_min << PAGE_SHIFT);
+ e = va_start + (vb->dirty_max << PAGE_SHIFT);
- if (s < start)
- start = s;
- if (e > end)
- end = e;
+ start = min(s, start);
+ end = max(e, end);
- i = j;
- i = find_next_bit(vb->dirty_map,
- VMAP_BBMAP_BITS, i);
+ /* Prevent that this is flushed again */
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+
+ flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
+ free_purged_blocks(&purge_list);
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ if (!__purge_vmap_area_lazy(start, end, false) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
+}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -1057,21 +3011,30 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
*/
void vm_unmap_ram(const void *mem, unsigned int count)
{
- unsigned long size = count << PAGE_SHIFT;
- unsigned long addr = (unsigned long)mem;
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
+ unsigned long addr = (unsigned long)kasan_reset_tag(mem);
+ struct vmap_area *va;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(addr & (PAGE_SIZE-1));
+ BUG_ON(!PAGE_ALIGNED(addr));
- debug_check_no_locks_freed(mem, size);
- vmap_debug_free_range(addr, addr+size);
+ kasan_poison_vmalloc(mem, size);
- if (likely(count <= VMAP_MAX_ALLOC))
- vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ if (likely(count <= VMAP_MAX_ALLOC)) {
+ debug_check_no_locks_freed(mem, size);
+ vb_free(addr, size);
+ return;
+ }
+
+ va = find_unlink_vmap_area(addr);
+ if (WARN_ON_ONCE(!va))
+ return;
+
+ debug_check_no_locks_freed((void *)va->va_start, va_size(va));
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1080,13 +3043,18 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @pages: an array of pointers to the pages to be mapped
* @count: number of pages
* @node: prefer to allocate data structures on this node
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
+ *
+ * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
+ * faster than vmap so it's good. But if you mix long-life and short-life
+ * objects with vm_map_ram(), it could consume lots of address space through
+ * fragmentation (especially on a 32bit machine). You could see failures in
+ * the end. Please use this function for short-lived objects.
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
- unsigned long size = count << PAGE_SHIFT;
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
void *mem;
@@ -1098,22 +3066,58 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
} else {
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
- VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ VMALLOC_START, VMALLOC_END,
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL);
if (IS_ERR(va))
return NULL;
addr = va->va_start;
mem = (void *)addr;
}
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+ return vm_area_page_order(vm);
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1154,139 +3158,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
- static size_t vm_init_off __initdata;
- unsigned long addr;
-
- addr = ALIGN(VMALLOC_START + vm_init_off, align);
- vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
-
- vm->addr = (void *)addr;
-
- vm_area_add_early(vm);
-}
-
-void __init vmalloc_init(void)
-{
- struct vmap_area *va;
- struct vm_struct *tmp;
- int i;
-
- for_each_possible_cpu(i) {
- struct vmap_block_queue *vbq;
- struct vfree_deferred *p;
-
- vbq = &per_cpu(vmap_block_queue, i);
- spin_lock_init(&vbq->lock);
- INIT_LIST_HEAD(&vbq->free);
- p = &per_cpu(vfree_deferred, i);
- init_llist_head(&p->list);
- INIT_WORK(&p->wq, free_work);
- }
-
- /* Import existing vmlist entries. */
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
- va->flags = VM_VM_AREA;
- va->va_start = (unsigned long)tmp->addr;
- va->va_end = va->va_start + tmp->size;
- va->vm = tmp;
- __insert_vmap_area(va);
- }
+ unsigned long addr = ALIGN(VMALLOC_START, align);
+ struct vm_struct *cur, **p;
- vmap_area_pcpu_hole = VMALLOC_END;
-
- vmap_initialized = true;
-}
-
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
- * before calling this function and flush_tlb_kernel_range() after.
- */
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
- vunmap_page_range(addr, addr + size);
-}
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
-
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
- flush_tlb_kernel_range(addr, end);
-}
-
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
-{
- unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + area->size - PAGE_SIZE;
- int err;
+ BUG_ON(vmap_initialized);
- err = vmap_page_range(addr, end, prot, *pages);
- if (err > 0) {
- *pages += err;
- err = 0;
+ for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
+ if ((unsigned long)cur->addr - addr >= vm->size)
+ break;
+ addr = ALIGN((unsigned long)cur->addr + cur->size, align);
}
- return err;
-}
-EXPORT_SYMBOL_GPL(map_vm_area);
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- vm->flags = flags;
- vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
- vm->caller = caller;
- va->vm = vm;
- va->flags |= VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
+ BUG_ON(addr > VMALLOC_END - vm->size);
+ vm->addr = (void *)addr;
+ vm->next = *p;
+ *p = vm;
+ kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -1294,75 +3181,85 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
}
-static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+struct vm_struct *__get_vm_area_node(unsigned long size,
+ unsigned long align, unsigned long shift, unsigned long flags,
+ unsigned long start, unsigned long end, int node,
+ gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
+ unsigned long requested_size = size;
BUG_ON(in_interrupt());
- if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
- size = PAGE_ALIGN(size);
+ size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
+ if (flags & VM_IOREMAP)
+ align = 1ul << clamp_t(int, get_count_order_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
- /*
- * We always allocate a guard page.
- */
- size += PAGE_SIZE;
+ if (!(flags & VM_NO_GUARD))
+ size += PAGE_SIZE;
+
+ area->flags = flags;
+ area->caller = caller;
+ area->requested_size = requested_size;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+ * best-effort approach, as they can be mapped outside of vmalloc code.
+ * For VM_ALLOC mappings, the pages are marked as accessible after
+ * getting mapped in __vmalloc_node_range().
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ if (!(flags & VM_ALLOC))
+ area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+ KASAN_VMALLOC_PROT_NORMAL);
return area;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
-{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
- * get_vm_area - reserve a contiguous kernel virtual area
- * @size: size of the area
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
- * Search an area of @size in the kernel virtual mapping area,
- * and reserved it for out purposes. Returns the area descriptor
- * on success or %NULL on failure.
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserved it for out purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
@@ -1370,369 +3267,929 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
- * find_vm_area - find a continuous kernel virtual area
- * @addr: base address
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
*
- * Search for the kernel VM area starting at @addr, and return it.
- * It is up to the caller to do all required locking to keep the returned
- * pointer valid.
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA)
- return va->vm;
+ if (!va)
+ return NULL;
- return NULL;
+ return va->vm;
}
/**
- * remove_vm_area - find and remove a continuous kernel virtual area
- * @addr: base address
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
*
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines, except for its size or flags.
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ struct vm_struct *vm;
- va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
- struct vm_struct *vm = va->vm;
+ might_sleep();
- spin_lock(&vmap_area_lock);
- va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
+ if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+ addr))
+ return NULL;
+
+ va = find_unlink_vmap_area((unsigned long)addr);
+ if (!va || !va->vm)
+ return NULL;
+ vm = va->vm;
- vmap_debug_free_range(va->va_start, va->va_end);
- free_unmap_vmap_area(va);
- vm->size -= PAGE_SIZE;
+ debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
+ debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
+ kasan_free_module_shadow(vm);
+ kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
- return vm;
- }
- return NULL;
+ free_unmap_vmap_area(va);
+ return vm;
}
-static void __vunmap(const void *addr, int deallocate_pages)
+static inline void set_area_direct_map(const struct vm_struct *area,
+ int (*set_direct_map)(struct page *page))
{
- struct vm_struct *area;
+ int i;
- if (!addr)
- return;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
+ for (i = 0; i < area->nr_pages; i++)
+ if (page_address(area->pages[i]))
+ set_direct_map(area->pages[i]);
+}
- if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
- addr))
- return;
+/*
+ * Flush the vm mapping and reset the direct map.
+ */
+static void vm_reset_perms(struct vm_struct *area)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ unsigned int page_order = vm_area_page_order(area);
+ int flush_dmap = 0;
+ int i;
- area = remove_vm_area(addr);
- if (unlikely(!area)) {
- WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
- addr);
- return;
+ /*
+ * Find the start and end range of the direct mappings to make sure that
+ * the vm_unmap_aliases() flush includes the direct map.
+ */
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ unsigned long addr = (unsigned long)page_address(area->pages[i]);
+
+ if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
+ start = min(addr, start);
+ end = max(addr + page_size, end);
+ flush_dmap = 1;
+ }
}
- debug_check_no_locks_freed(addr, area->size);
- debug_check_no_obj_freed(addr, area->size);
+ /*
+ * Set direct map to something invalid so that it won't be cached if
+ * there are any accesses after the TLB flush, then flush the TLB and
+ * reset the direct map permissions to the default.
+ */
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
+ _vm_unmap_aliases(start, end, flush_dmap);
+ set_area_direct_map(area, set_direct_map_default_noflush);
+}
- if (deallocate_pages) {
- int i;
+static void delayed_vfree_work(struct work_struct *w)
+{
+ struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+ struct llist_node *t, *llnode;
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page = area->pages[i];
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ vfree(llnode);
+}
- BUG_ON(!page);
- __free_page(page);
- }
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
- if (area->flags & VM_VPAGES)
- vfree(area->pages);
- else
- kfree(area->pages);
- }
+ BUG_ON(in_nmi());
+ kmemleak_free(addr);
- kfree(area);
- return;
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * another cpu's list. schedule_work() should be fine with this too.
+ */
+ if (addr && llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
}
-
+
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
- * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
+ * Context:
+ * May sleep if called *not* from interrupt context.
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
- BUG_ON(in_nmi());
+ struct vm_struct *vm;
+ int i;
+ if (unlikely(in_interrupt())) {
+ vfree_atomic(addr);
+ return;
+ }
+
+ BUG_ON(in_nmi());
kmemleak_free(addr);
+ might_sleep();
if (!addr)
return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
- __vunmap(addr, 1);
+
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+
+ if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
+ vm_reset_perms(vm);
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
+ for (i = 0; i < vm->nr_pages; i++) {
+ struct page *page = vm->pages[i];
+
+ BUG_ON(!page);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_page(page);
+ cond_resched();
+ }
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ kvfree(vm->pages);
+ kfree(vm);
}
EXPORT_SYMBOL(vfree);
/**
- * vunmap - release virtual mapping obtained by vmap()
- * @addr: memory base address
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
*
- * Free the virtually contiguous memory area starting at @addr,
- * which was created from the page array passed to vmap().
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
*
- * Must not be called in interrupt context.
+ * Must not be called in interrupt context.
*/
void vunmap(const void *addr)
{
+ struct vm_struct *vm;
+
BUG_ON(in_interrupt());
might_sleep();
- if (addr)
- __vunmap(addr, 0);
+
+ if (!addr)
+ return;
+ vm = remove_vm_area(addr);
+ if (unlikely(!vm)) {
+ WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+ kfree(vm);
}
EXPORT_SYMBOL(vunmap);
/**
- * vmap - map an array of pages into virtually contiguous space
- * @pages: array of page pointers
- * @count: number of pages to map
- * @flags: vm_area->flags
- * @prot: page protection for the mapping
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *vmap(struct page **pages, unsigned int count,
- unsigned long flags, pgprot_t prot)
+ unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
+ unsigned long size; /* In bytes */
might_sleep();
- if (count > totalram_pages)
+ if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
return NULL;
- area = get_vm_area_caller((count << PAGE_SHIFT), flags,
- __builtin_return_address(0));
+ /*
+ * Your top guard is someone else's bottom guard. Not having a top
+ * guard compromises someone else's mappings too.
+ */
+ if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+ flags &= ~VM_NO_GUARD;
+
+ if (count > totalram_pages())
+ return NULL;
+
+ size = (unsigned long)count << PAGE_SHIFT;
+ area = get_vm_area_caller(size, flags, __builtin_return_address(0));
if (!area)
return NULL;
- if (map_vm_area(area, prot, &pages)) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES) {
+ area->pages = pages;
+ area->nr_pages = count;
+ }
return area->addr;
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+ unsigned long pfn = data->pfns[data->idx];
+ pte_t ptent;
+
+ if (WARN_ON_ONCE(pfn_valid(pfn)))
+ return -EINVAL;
+
+ ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
+ set_pte_at(&init_mm, addr, pte, ptent);
+
+ data->idx++;
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ flush_cache_vmap((unsigned long)area->addr,
+ (unsigned long)area->addr + count * PAGE_SIZE);
+
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
+/*
+ * Helper for vmalloc to adjust the gfp flags for certain allocations.
+ */
+static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large)
+{
+ flags |= __GFP_NOWARN;
+ if (large)
+ flags &= ~__GFP_NOFAIL;
+ return flags;
+}
+
+static inline unsigned int
+vm_area_alloc_pages(gfp_t gfp, int nid,
+ unsigned int order, unsigned int nr_pages, struct page **pages)
+{
+ unsigned int nr_allocated = 0;
+ unsigned int nr_remaining = nr_pages;
+ unsigned int max_attempt_order = MAX_PAGE_ORDER;
+ struct page *page;
+ int i;
+ unsigned int large_order = ilog2(nr_remaining);
+ gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM;
+
+ large_order = min(max_attempt_order, large_order);
+
+ /*
+ * Initially, attempt to have the page allocator give us large order
+ * pages. Do not attempt allocating smaller than order chunks since
+ * __vmap_pages_range() expects physically contigous pages of exactly
+ * order long chunks.
+ */
+ while (large_order > order && nr_remaining) {
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages_noprof(large_gfp, large_order);
+ else
+ page = alloc_pages_node_noprof(nid, large_gfp, large_order);
+
+ if (unlikely(!page)) {
+ max_attempt_order = --large_order;
+ continue;
+ }
+
+ split_page(page, large_order);
+ for (i = 0; i < (1U << large_order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ nr_allocated += 1U << large_order;
+ nr_remaining = nr_pages - nr_allocated;
+
+ large_order = ilog2(nr_remaining);
+ large_order = min(max_attempt_order, large_order);
+ }
+
+ /*
+ * For order-0 pages we make use of bulk allocator, if
+ * the page array is partly or not at all populated due
+ * to fails, fallback to a single page allocator that is
+ * more permissive.
+ */
+ if (!order) {
+ while (nr_allocated < nr_pages) {
+ unsigned int nr, nr_pages_request;
+
+ /*
+ * A maximum allowed request is hard-coded and is 100
+ * pages per call. That is done in order to prevent a
+ * long preemption off scenario in the bulk-allocator
+ * so the range is [1:100].
+ */
+ nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+ /* memory allocation should consider mempolicy, we can't
+ * wrongly use nearest node when nid == NUMA_NO_NODE,
+ * otherwise memory may be allocated in only one node,
+ * but mempolicy wants to alloc memory by interleaving.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
+ nr = alloc_pages_bulk_mempolicy_noprof(gfp,
+ nr_pages_request,
+ pages + nr_allocated);
+ else
+ nr = alloc_pages_bulk_node_noprof(gfp, nid,
+ nr_pages_request,
+ pages + nr_allocated);
+
+ nr_allocated += nr;
+
+ /*
+ * If zero or pages were obtained partly,
+ * fallback to a single page allocator.
+ */
+ if (nr != nr_pages_request)
+ break;
+ }
+ }
+
+ /* High-order pages or fallback path if "bulk" fails. */
+ while (nr_allocated < nr_pages) {
+ if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
+ break;
+
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages_noprof(gfp, order);
+ else
+ page = alloc_pages_node_noprof(nid, gfp, order);
+
+ if (unlikely(!page))
+ break;
+
+ /*
+ * High-order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (order)
+ split_page(page, order);
+
+ /*
+ * Careful, we allocate and map page-order pages, but
+ * tracking is done per PAGE_SIZE page so as to keep the
+ * vm_struct APIs independent of the physical/mapped size.
+ */
+ for (i = 0; i < (1U << order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ nr_allocated += 1U << order;
+ }
+
+ return nr_allocated;
+}
+
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+ struct vm_struct *area, *tmp;
+ struct llist_node *head;
+
+ head = llist_del_all(&pending_vm_area_cleanup);
+ if (!head)
+ return;
+
+ llist_for_each_entry_safe(area, tmp, head, llnode) {
+ if (!area->pages)
+ free_vm_area(area);
+ else
+ vfree(area->addr);
+ }
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+ if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+ schedule_work(&cleanup_vm_area);
+}
+
+/*
+ * Page tables allocations ignore external GFP. Enforces it by
+ * the memalloc scope API. It is used by vmalloc internals and
+ * KASAN shadow population only.
+ *
+ * GFP to scope mapping:
+ *
+ * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
+ * GFP_NOFS - memalloc_nofs_save()
+ * GFP_NOIO - memalloc_noio_save()
+ *
+ * Returns a flag cookie to pair with restore.
+ */
+unsigned int
+memalloc_apply_gfp_scope(gfp_t gfp_mask)
+{
+ unsigned int flags = 0;
+
+ if (!gfpflags_allow_blocking(gfp_mask))
+ flags = memalloc_noreclaim_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ /* 0 - no scope applied. */
+ return flags;
+}
+
+void
+memalloc_restore_scope(unsigned int flags)
+{
+ if (flags)
+ memalloc_flags_restore(flags);
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
- const int order = 0;
- struct page **pages;
- unsigned int nr_pages, array_size, i;
- gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+ const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+ bool nofail = gfp_mask & __GFP_NOFAIL;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
+ unsigned long array_size;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
+ unsigned int flags;
+ int ret;
+
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
- nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */
+ if (!gfpflags_allow_blocking(gfp_mask))
+ nofail = false;
+
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
- area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
- PAGE_KERNEL, node, caller);
- area->flags |= VM_VPAGES;
+ area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
- pages = kmalloc_node(array_size, nested_gfp, node);
+ area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
}
- area->pages = pages;
- area->caller = caller;
+
if (!area->pages) {
- remove_vm_area(area->addr);
- kfree(area);
- return NULL;
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to allocated page array size %lu",
+ nr_small_pages * PAGE_SIZE, array_size);
+ goto fail;
}
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
- gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
+ page_order = vm_area_page_order(area);
- if (node < 0)
- page = alloc_page(tmp_mask);
- else
- page = alloc_pages_node(node, tmp_mask, order);
+ /*
+ * High-order nofail allocations are really expensive and
+ * potentially dangerous (pre-mature OOM, disruptive reclaim
+ * and compaction etc.
+ *
+ * Please note, the __vmalloc_node_range_noprof() falls-back
+ * to order-0 pages if high-order attempt is unsuccessful.
+ */
+ area->nr_pages = vm_area_alloc_pages(
+ vmalloc_gfp_adjust(gfp_mask, page_order), node,
+ page_order, nr_small_pages, area->pages);
- if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
- area->nr_pages = i;
- goto fail;
- }
- area->pages[i] = page;
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
+
+ /*
+ * If not enough pages were obtained to accomplish an
+ * allocation request, free them via vfree() if any.
+ */
+ if (area->nr_pages != nr_small_pages) {
+ /*
+ * vm_area_alloc_pages() can fail due to insufficient memory but
+ * also:-
+ *
+ * - a pending fatal signal
+ * - insufficient huge page-order pages
+ *
+ * Since we always retry allocations at order-0 in the huge page
+ * case a warning for either is spurious.
+ */
+ if (!fatal_signal_pending(current) && page_order == 0)
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE);
+ goto fail;
}
- if (map_vm_area(area, prot, &pages))
+ /*
+ * page tables allocations ignore external gfp mask, enforce it
+ * by the scope API
+ */
+ flags = memalloc_apply_gfp_scope(gfp_mask);
+ do {
+ ret = __vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift, nested_gfp);
+ if (nofail && (ret < 0))
+ schedule_timeout_uninterruptible(1);
+ } while (nofail && (ret < 0));
+ memalloc_restore_scope(flags);
+
+ if (ret < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
+
return area->addr;
fail:
- warn_alloc_failed(gfp_mask, order,
- "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
- (area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
+ defer_vm_area_cleanup(area);
return NULL;
}
+/*
+ * See __vmalloc_node_range() for a clear list of supported vmalloc flags.
+ * This gfp lists all flags currently passed through vmalloc. Currently,
+ * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm
+ * and BPF also use GFP_USER. Additionally, various users pass
+ * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP.
+ */
+#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
+ __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
+ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
+ GFP_USER | __GFP_NOLOCKDEP)
+
+static gfp_t vmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED;
+
+ flags &= GFP_VMALLOC_SUPPORTED;
+ WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ return flags;
+}
+
/**
- * __vmalloc_node_range - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @start: vm area range start
- * @end: vm area range end
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
- *
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
- */
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags and map them into contiguous
+ * virtual range with protection @prot.
+ *
+ * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT,
+ * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported.
+ * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only
+ * by __vmalloc().
+ *
+ * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY
+ * and %__GFP_RETRY_MAYFAIL are not supported.
+ *
+ * %__GFP_NOWARN can be used to suppress failure messages.
+ *
+ * Can not be called from interrupt nor NMI contexts.
+ * Return: the address of the area or %NULL on failure
+ */
+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
struct vm_struct *area;
- void *addr;
- unsigned long real_size = size;
+ void *ret;
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
+ unsigned long original_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages)
- goto fail;
+ if (WARN_ON_ONCE(!size))
+ return NULL;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
- start, end, node, gfp_mask, caller);
- if (!area)
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, exceeds total pages",
+ size);
+ return NULL;
+ }
+
+ if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations,
+ * others like modules don't yet expect huge pages in
+ * their allocations due to apply_to_page_range not
+ * supporting them.
+ */
+
+ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
+ shift = PMD_SHIFT;
+ else
+ shift = arch_vmap_pte_supported_shift(size);
+
+ align = max(original_align, 1UL << shift);
+ }
+
+again:
+ area = __get_vm_area_node(size, align, shift, VM_ALLOC |
+ VM_UNINITIALIZED | vm_flags, start, end, node,
+ gfp_mask, caller);
+ if (!area) {
+ bool nofail = gfp_mask & __GFP_NOFAIL;
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, vm_struct allocation failed%s",
+ size, (nofail) ? ". Retrying." : "");
+ if (nofail) {
+ schedule_timeout_uninterruptible(1);
+ goto again;
+ }
goto fail;
+ }
- addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
- if (!addr)
+ /*
+ * Prepare arguments for __vmalloc_area_node() and
+ * kasan_unpoison_vmalloc().
+ */
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+ if (kasan_hw_tags_enabled()) {
+ /*
+ * Modify protection bits to allow tagging.
+ * This must be done before mapping.
+ */
+ prot = arch_vmap_pgprot_tagged(prot);
+
+ /*
+ * Skip page_alloc poisoning and zeroing for physical
+ * pages backing VM_ALLOC mapping. Memory is instead
+ * poisoned and zeroed by kasan_unpoison_vmalloc().
+ */
+ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
+ }
+
+ /* Take note that the mapping is PAGE_KERNEL. */
+ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+ }
+
+ /* Allocate physical pages and map them into vmalloc space. */
+ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+ if (!ret)
goto fail;
/*
+ * Mark the pages as accessible, now that they are mapped.
+ * The condition for setting KASAN_VMALLOC_INIT should complement the
+ * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
+ * to make sure that memory is initialized under the same conditions.
+ * Tag-based KASAN modes only assign tags to normal non-executable
+ * allocations, see __kasan_unpoison_vmalloc().
+ */
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
+ (gfp_mask & __GFP_SKIP_ZERO))
+ kasan_flags |= KASAN_VMALLOC_INIT;
+ /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
clear_vm_uninitialized_flag(area);
- /*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
- */
- kmemleak_alloc(addr, real_size, 3, gfp_mask);
+ if (!(vm_flags & VM_DEFER_KMEMLEAK))
+ kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
- return addr;
+ return area->addr;
fail:
- warn_alloc_failed(gfp_mask, 0,
- "vmalloc: allocation failure: %lu bytes\n",
- real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = original_align;
+ goto again;
+ }
+
return NULL;
}
/**
- * __vmalloc_node - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
+ *
+ * Semantics of @gfp_mask (including reclaim/retry modifiers such as
+ * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof().
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
{
- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, node, caller);
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
+#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
+ return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(__vmalloc_noprof);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
+/**
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_noprof(unsigned long size)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
+EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc - allocate virtually contiguous memory
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc(unsigned long size)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_HIGHMEM);
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
- * vzalloc - allocate virtually contiguous memory with zero fill
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
- * The memory allocated is set to zero.
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vzalloc(unsigned long size)
+void *vzalloc_noprof(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vzalloc);
+EXPORT_SYMBOL(vzalloc_noprof);
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
@@ -1740,41 +4197,37 @@ EXPORT_SYMBOL(vzalloc);
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_user(unsigned long size)
+void *vmalloc_user_noprof(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_user);
+EXPORT_SYMBOL(vmalloc_user_noprof);
/**
- * vmalloc_node - allocate memory on a specific node
- * @size: allocation size
- * @node: numa node
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
*
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_node(unsigned long size, int node)
+void *vmalloc_node_noprof(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc_node_noprof);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
@@ -1785,340 +4238,475 @@ EXPORT_SYMBOL(vmalloc_node);
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vzalloc_node(unsigned long size, int node)
+void *vzalloc_node_noprof(unsigned long size, int node)
{
- return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vzalloc_node);
-
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
+EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
+ * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @align: requested alignment
+ * @flags: the flags for the page level allocator
+ * @nid: node number of the target node
+ *
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If the caller wants the new memory to be on specific node *only*,
+ * __GFP_THISNODE flag should be set, otherwise the function will try to avoid
+ * reallocation and possibly disregard the specified @nid.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
*
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or vfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory; %NULL if @size is zero or in case of
+ * failure
*/
-
-void *vmalloc_exec(unsigned long size)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
- NUMA_NO_NODE, __builtin_return_address(0));
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
+ size_t old_size = 0;
+ void *n;
+
+ if (!size) {
+ vfree(p);
+ return NULL;
+ }
+
+ if (p) {
+ vm = find_vm_area(p);
+ if (unlikely(!vm)) {
+ WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
+ return NULL;
+ }
+
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
+ }
+
+ /*
+ * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
+ * would be a good heuristic for when to shrink the vm_area?
+ */
+ if (size <= old_size) {
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
+ memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
+ kasan_poison_vmalloc(p + size, old_size - size);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
+ return (void *)p;
+ }
+
+need_realloc:
+ /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
+ if (!n)
+ return NULL;
+
+ if (p) {
+ memcpy(n, p, old_size);
+ vfree(p);
+ }
+
+ return n;
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
-#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
-#define GFP_VMALLOC32 GFP_KERNEL
+/*
+ * 64b systems should always have either DMA or DMA32 zones. For others
+ * GFP_DMA32 should do the right thing and use the normal zone.
+ */
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/**
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- * @size: allocation size
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
*
- * Allocate enough 32bit PA addressable pages to cover @size from the
- * page level allocator and map them into contiguous kernel virtual space.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_32(unsigned long size)
+void *vmalloc_32_noprof(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmalloc_32_noprof);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
- * @size: allocation size
+ * @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_32_user(unsigned long size)
+void *vmalloc_32_user_noprof(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_32_user);
+EXPORT_SYMBOL(vmalloc_32_user_noprof);
/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
*/
-
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+static size_t zero_iter(struct iov_iter *iter, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
- while (count) {
- unsigned long offset, length;
+ while (remains > 0) {
+ size_t num, copied;
- offset = (unsigned long)addr & ~PAGE_MASK;
- length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
- /*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
- */
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(buf, map + offset, length);
- kunmap_atomic(map);
- } else
- memset(buf, 0, length);
+ num = min_t(size_t, remains, PAGE_SIZE);
+ copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+ remains -= copied;
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ if (copied < num)
+ break;
}
- return copied;
+
+ return count - remains;
}
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
+/*
+ * small helper routine, copy contents to iter from addr.
+ * If the page is not present, fill zero.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+ const char *addr, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
+ struct page *page;
- while (count) {
+ while (remains > 0) {
unsigned long offset, length;
+ size_t copied = 0;
- offset = (unsigned long)addr & ~PAGE_MASK;
+ offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
+ if (length > remains)
+ length = remains;
+ page = vmalloc_to_page(addr);
/*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
+ * To do safe access to this _mapped_ area, we need lock. But
+ * adding lock here means that we need to add overhead of
+ * vmalloc()/vfree() calls for this _debug_ interface, rarely
+ * used. Instead of that, we'll use an local mapping via
+ * copy_page_to_iter_nofault() and accept a small overhead in
+ * this access function.
*/
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(map + offset, buf, length);
- kunmap_atomic(map);
- }
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ if (page)
+ copied = copy_page_to_iter_nofault(page, offset,
+ length, iter);
+ else
+ copied = zero_iter(iter, length);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != length)
+ break;
}
- return copied;
+
+ return count - remains;
}
-/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * Returns # of bytes which addr and buf should be increased.
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
- * includes any intersect with alive vmalloc area.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
+/*
+ * Read from a vm_map_ram region of memory.
*
+ * Returns the number of copied bytes.
*/
-
-long vread(char *buf, char *addr, unsigned long count)
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ size_t count, unsigned long flags)
{
- struct vmap_area *va;
- struct vm_struct *vm;
- char *vaddr, *buf_start = buf;
- unsigned long buflen = count;
- unsigned long n;
+ char *start;
+ struct vmap_block *vb;
+ struct xarray *xa;
+ unsigned long offset;
+ unsigned int rs, re;
+ size_t remains, n;
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
+ /*
+ * If it's area created by vm_map_ram() interface directly, but
+ * not further subdividing and delegating management to vmap_block,
+ * handle it here.
+ */
+ if (!(flags & VMAP_BLOCK))
+ return aligned_vread_iter(iter, addr, count);
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
+ remains = count;
- if (!(va->flags & VM_VM_AREA))
- continue;
+ /*
+ * Area is split into regions and tracked with vmap_block, read out
+ * each region and zero fill the hole between regions.
+ */
+ xa = addr_to_vb_xa((unsigned long) addr);
+ vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
+ if (!vb)
+ goto finished_zero;
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + vm->size - PAGE_SIZE)
- continue;
- while (addr < vaddr) {
- if (count == 0)
+ spin_lock(&vb->lock);
+ if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+ spin_unlock(&vb->lock);
+ goto finished_zero;
+ }
+
+ for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
+
+ start = vmap_block_vaddr(vb->va->va_start, rs);
+
+ if (addr < start) {
+ size_t to_zero = min_t(size_t, start - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
}
- n = vaddr + vm->size - PAGE_SIZE - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP))
- aligned_vread(buf, addr, n);
- else /* IOREMAP area is treated as memory hole */
- memset(buf, 0, n);
- buf += n;
- addr += n;
- count -= n;
+
+ /*it could start reading from the middle of used region*/
+ offset = offset_in_page(addr);
+ n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+ if (n > remains)
+ n = remains;
+
+ copied = aligned_vread_iter(iter, start + offset, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-finished:
- spin_unlock(&vmap_area_lock);
- if (buf == buf_start)
- return 0;
- /* zero-fill memory holes */
- if (buf != buf_start + buflen)
- memset(buf, 0, buflen - (buf - buf_start));
+ spin_unlock(&vb->lock);
- return buflen;
+finished_zero:
+ /* zero-fill the left dirty or free regions */
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* We couldn't copy/zero everything */
+ spin_unlock(&vb->lock);
+ return count - remains;
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter: the iterator to which data should be written.
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
- * Returns # of bytes which addr and buf should be incresed.
- * (same number to @count).
- * If [addr...addr+count) doesn't includes any intersect with valid
- * vmalloc area, returns 0.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
*
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, as /proc/kcore.
*
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
+ * Return: number of bytes for which addr and buf should be increased
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
+ * include any intersection with valid vmalloc area
*/
-
-long vwrite(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
+ struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *vm;
char *vaddr;
- unsigned long n, buflen;
- int copied = 0;
+ size_t n, size, flags, remains;
+ unsigned long next;
+
+ addr = kasan_reset_tag(addr);
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- buflen = count;
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
+ remains = count;
- if (!(va->flags & VM_VM_AREA))
- continue;
+ vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
+ if (!vn)
+ goto finished_zero;
+
+ /* no intersects with alive vmap_area */
+ if ((unsigned long)addr + remains <= va->va_start)
+ goto finished_zero;
+
+ do {
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + vm->size - PAGE_SIZE)
- continue;
- while (addr < vaddr) {
- if (count == 0)
+ flags = va->flags & VMAP_FLAGS_MASK;
+ /*
+ * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
+ * be set together with VMAP_RAM.
+ */
+ WARN_ON(flags == VMAP_BLOCK);
+
+ if (!vm && !flags)
+ goto next_va;
+
+ if (vm && (vm->flags & VM_UNINITIALIZED))
+ goto next_va;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+
+ vaddr = (char *) va->va_start;
+ size = vm ? get_vm_area_size(vm) : va_size(va);
+
+ if (addr >= vaddr + size)
+ goto next_va;
+
+ if (addr < vaddr) {
+ size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- buf++;
- addr++;
- count--;
- }
- n = vaddr + vm->size - PAGE_SIZE - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP)) {
- aligned_vwrite(buf, addr, n);
- copied++;
}
- buf += n;
- addr += n;
- count -= n;
- }
+
+ n = vaddr + size - addr;
+ if (n > remains)
+ n = remains;
+
+ if (flags & VMAP_RAM)
+ copied = vmap_ram_vread_iter(iter, addr, n, flags);
+ else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
+ copied = aligned_vread_iter(iter, addr, n);
+ else /* IOREMAP | SPARSE area is treated as memory hole */
+ copied = zero_iter(iter, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
+
+ next_va:
+ next = va->va_end;
+ spin_unlock(&vn->busy.lock);
+ } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
+
+finished_zero:
+ if (vn)
+ spin_unlock(&vn->busy.lock);
+
+ /* zero-fill memory holes */
+ return count - remains + zero_iter(iter, remains);
finished:
- spin_unlock(&vmap_area_lock);
- if (!copied)
- return 0;
- return buflen;
+ /* Nothing remains, or We couldn't copy/zero everything. */
+ if (vn)
+ spin_unlock(&vn->busy.lock);
+
+ return count - remains;
}
/**
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
- * @vma: vma to cover
- * @uaddr: target user address to start at
- * @kaddr: virtual address of vmalloc kernel memory
- * @size: size of map area
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @pgoff: offset from @kaddr to start at
+ * @size: size of map area
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that @kaddr is a valid vmalloc'ed area,
- * and that it is big enough to cover the range starting at
- * @uaddr in @vma. Will return failure if that criteria isn't
- * met.
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
- void *kaddr, unsigned long size)
+ void *kaddr, unsigned long pgoff,
+ unsigned long size)
{
struct vm_struct *area;
+ unsigned long off;
+ unsigned long end_index;
+
+ if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+ return -EINVAL;
size = PAGE_ALIGN(size);
@@ -2129,11 +4717,13 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!area)
return -EINVAL;
- if (!(area->flags & VM_USERMAP))
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (check_add_overflow(size, off, &end_index) ||
+ end_index > get_vm_area_size(area))
return -EINVAL;
+ kaddr += off;
do {
struct page *page = vmalloc_to_page(kaddr);
@@ -2148,92 +4738,34 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
size -= PAGE_SIZE;
} while (size > 0);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
- * remap_vmalloc_range - map vmalloc pages to userspace
- * @vma: vma to cover (map full range of vma)
- * @addr: vmalloc memory
- * @pgoff: number of pages into addr before first page to map
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * that it is big enough to cover the vma. Will return failure if
- * that criteria isn't met.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
return remap_vmalloc_range_partial(vma, vma->vm_start,
- addr + (pgoff << PAGE_SHIFT),
+ addr, pgoff,
vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);
-/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
- */
-void __attribute__((weak)) vmalloc_sync_all(void)
-{
-}
-
-
-static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
@@ -2246,85 +4778,69 @@ EXPORT_SYMBOL_GPL(free_vm_area);
#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
- return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+ return rb_entry_safe(n, struct vmap_area, rb_node);
}
/**
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
- * @end: target address
- * @pnext: out arg for the next vmap_area
- * @pprev: out arg for the previous vmap_area
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
*
- * Returns: %true if either or both of next and prev are found,
- * %false if no vmap_area exists
- *
- * Find vmap_areas end addresses of which enclose @end. ie. if not
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ * Returns: vmap_area if it is found. If there is no such area
+ * the first highest(reverse order) vmap_area is returned
+ * i.e. va->va_start < addr && va->va_end < addr or NULL
+ * if there are no any areas before @addr.
*/
-static bool pvm_find_next_prev(unsigned long end,
- struct vmap_area **pnext,
- struct vmap_area **pprev)
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
{
- struct rb_node *n = vmap_area_root.rb_node;
- struct vmap_area *va = NULL;
+ struct vmap_area *va, *tmp;
+ struct rb_node *n;
+
+ n = free_vmap_area_root.rb_node;
+ va = NULL;
while (n) {
- va = rb_entry(n, struct vmap_area, rb_node);
- if (end < va->va_end)
- n = n->rb_left;
- else if (end > va->va_end)
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_start <= addr) {
+ va = tmp;
+ if (tmp->va_end >= addr)
+ break;
+
n = n->rb_right;
- else
- break;
+ } else {
+ n = n->rb_left;
+ }
}
- if (!va)
- return false;
-
- if (va->va_end > end) {
- *pnext = va;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
- } else {
- *pprev = va;
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
- }
- return true;
+ return va;
}
/**
- * pvm_determine_end - find the highest aligned address between two vmap_areas
- * @pnext: in/out arg for the next vmap_area
- * @pprev: in/out arg for the previous vmap_area
- * @align: alignment
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search(reverse order);
+ * out - the VA with the highest aligned end address.
+ * @align: alignment for required highest address
*
- * Returns: determined end address
- *
- * Find the highest aligned address between *@pnext and *@pprev below
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
- * down address is between the end addresses of the two vmap_areas.
- *
- * Please note that the address returned by this function may fall
- * inside *@pnext vmap_area. The caller is responsible for checking
- * that.
+ * Returns: determined end address within vmap_area
*/
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
- struct vmap_area **pprev,
- unsigned long align)
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
- if (*pnext)
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
- else
- addr = vmalloc_end;
-
- while (*pprev && (*pprev)->va_end > addr) {
- *pnext = *pprev;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ if (likely(*va)) {
+ list_for_each_entry_from_reverse((*va),
+ &free_vmap_area_list, list) {
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+ if ((*va)->va_start < addr)
+ return addr;
+ }
}
- return addr;
+ return 0;
}
/**
@@ -2344,12 +4860,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* to gigabytes. To avoid interacting with regular vmallocs, these
* areas are allocated from top.
*
- * Despite its complicated look, this allocator is rather simple. It
- * does everything top-down and scans areas from the end looking for
- * matching slot. While scanning, if any of the areas overlaps with
- * existing vmap_area, the base address is pulled down to fit the
- * area. Scanning is repeated till all the areas fit and then all
- * necessary data structres are inserted and the result is returned.
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for matching base. While scanning, if any of the areas do not fit the
+ * base address is pulled down to fit the area. Scanning is repeated till
+ * all the areas fit and then all necessary data structures are inserted
+ * and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2357,14 +4873,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
- struct vmap_area **vas, *prev, *next;
+ struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, end, last_end;
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
/* verify parameters and allocate data structures */
- BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+ BUG_ON(offset_in_page(align) || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
@@ -2377,15 +4893,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
if (start > offsets[last_area])
last_area = area;
- for (area2 = 0; area2 < nr_vms; area2++) {
+ for (area2 = area + 1; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
- if (area2 == area)
- continue;
-
- BUG_ON(start2 >= start && start2 < end);
- BUG_ON(end2 <= end && end2 > start);
+ BUG_ON(start2 < end && start < end2);
}
}
last_end = offsets[last_area] + sizes[last_area];
@@ -2401,62 +4913,52 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free2;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
start = offsets[area];
end = start + sizes[area];
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
- base = vmalloc_end - last_end;
- goto found;
- }
- base = pvm_determine_end(&next, &prev, align) - end;
+ va = pvm_find_va_enclose_addr(vmalloc_end);
+ base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
- BUG_ON(next && next->va_end <= base + end);
- BUG_ON(prev && prev->va_end > base + end);
-
/*
* base might have underflowed, add last_end before
* comparing.
*/
- if (base + last_end < vmalloc_start + last_end) {
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = true;
- goto retry;
- }
- goto err_free;
- }
+ if (base + last_end < vmalloc_start + last_end)
+ goto overflow;
/*
- * If next overlaps, move base downwards so that it's
- * right below next and then recheck.
+ * Fitting base has not been found.
*/
- if (next && next->va_start < base + end) {
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (va == NULL)
+ goto overflow;
+
+ /*
+ * If required width exceeds current VA block, move
+ * base downwards and then recheck.
+ */
+ if (base + end > va->va_end) {
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/*
- * If prev overlaps, shift down next and prev and move
- * base so that it's right below new next and then
- * recheck.
+ * If this VA does not fit, move base downwards and recheck.
*/
- if (prev && prev->va_end > base + start) {
- next = prev;
- prev = node_to_va(rb_prev(&next->rb_node));
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (base + start < va->va_start) {
+ va = node_to_va(rb_prev(&va->rb_node));
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
@@ -2468,41 +4970,142 @@ retry:
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
+
start = offsets[area];
end = start + sizes[area];
- pvm_find_next_prev(base + end, &next, &prev);
+ va = pvm_find_va_enclose_addr(base + end);
}
-found:
+
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
- struct vmap_area *va = vas[area];
+ int ret;
+
+ start = base + offsets[area];
+ size = sizes[area];
+
+ va = pvm_find_va_enclose_addr(start);
+ if (WARN_ON_ONCE(va == NULL))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
- va->va_start = base + offsets[area];
- va->va_end = va->va_start + sizes[area];
- __insert_vmap_area(va);
+ ret = va_clip(&free_vmap_area_root,
+ &free_vmap_area_list, va, start, size);
+ if (WARN_ON_ONCE(unlikely(ret)))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ /* Allocated area. */
+ va = vas[area];
+ va->va_start = start;
+ va->va_end = start + size;
}
- vmap_area_pcpu_hole = base + offsets[last_area];
+ spin_unlock(&free_vmap_area_lock);
- spin_unlock(&vmap_area_lock);
+ /* populate the kasan shadow space */
+ for (area = 0; area < nr_vms; area++) {
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
+ goto err_free_shadow;
+ }
/* insert all vm's */
- for (area = 0; area < nr_vms; area++)
+ for (area = 0; area < nr_vms; area++) {
+ struct vmap_node *vn = addr_to_node(vas[area]->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ spin_unlock(&vn->busy.lock);
+ }
+
+ /*
+ * Mark allocated areas as accessible. Do it now as a best-effort
+ * approach, as they can be mapped outside of vmalloc code.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ for (area = 0; area < nr_vms; area++)
+ vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+ vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
kfree(vas);
return vms;
+recovery:
+ /*
+ * Remove previously allocated areas. There is no
+ * need in removing these areas from the busy tree,
+ * because they are inserted only on the final step
+ * and when pcpu_get_vm_areas() is success.
+ */
+ while (area--) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
+ vas[area] = NULL;
+ }
+
+overflow:
+ spin_unlock(&free_vmap_area_lock);
+ if (!purged) {
+ reclaim_and_purge_vmap_areas();
+ purged = true;
+
+ /* Before "retry", check if we recover. */
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ continue;
+
+ vas[area] = kmem_cache_zalloc(
+ vmap_area_cachep, GFP_KERNEL);
+ if (!vas[area])
+ goto err_free;
+ }
+
+ goto retry;
+ }
+
err_free:
for (area = 0; area < nr_vms; area++) {
- kfree(vas[area]);
+ if (vas[area])
+ kmem_cache_free(vmap_area_cachep, vas[area]);
+
kfree(vms[area]);
}
err_free2:
kfree(vas);
kfree(vms);
return NULL;
+
+err_free_shadow:
+ spin_lock(&free_vmap_area_lock);
+ /*
+ * We release all the vmalloc shadows, even the ones for regions that
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
+ * being able to tolerate this case.
+ */
+ for (area = 0; area < nr_vms; area++) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
+ vas[area] = NULL;
+ kfree(vms[area]);
+ }
+ spin_unlock(&free_vmap_area_lock);
+ kfree(vas);
+ kfree(vms);
+ return NULL;
}
/**
@@ -2522,203 +5125,350 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PROC_FS
-static void *s_start(struct seq_file *m, loff_t *pos)
- __acquires(&vmap_area_lock)
+#ifdef CONFIG_PRINTK
+bool vmalloc_dump_obj(void *object)
{
- loff_t n = *pos;
+ const void *caller;
+ struct vm_struct *vm;
struct vmap_area *va;
+ struct vmap_node *vn;
+ unsigned long addr;
+ unsigned int nr_pages;
- spin_lock(&vmap_area_lock);
- va = list_entry((&vmap_area_list)->next, typeof(*va), list);
- while (n > 0 && &va->list != &vmap_area_list) {
- n--;
- va = list_entry(va->list.next, typeof(*va), list);
+ addr = PAGE_ALIGN((unsigned long) object);
+ vn = addr_to_node(addr);
+
+ if (!spin_trylock(&vn->busy.lock))
+ return false;
+
+ va = __find_vmap_area(addr, &vn->busy.root);
+ if (!va || !va->vm) {
+ spin_unlock(&vn->busy.lock);
+ return false;
}
- if (!n && &va->list != &vmap_area_list)
- return va;
- return NULL;
+ vm = va->vm;
+ addr = (unsigned long) vm->addr;
+ caller = vm->caller;
+ nr_pages = vm->nr_pages;
+ spin_unlock(&vn->busy.lock);
+
+ pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+ nr_pages, addr, caller);
+ return true;
}
+#endif
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and VM_UNINITIALIZED bit in v->flags is disabled.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
{
- struct vmap_area *va = p, *next;
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
+
+ if (!counters)
+ return;
- ++*pos;
- next = list_entry(va->list.next, typeof(*va), list);
- if (&next->list != &vmap_area_list)
- return next;
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- return NULL;
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
-static void s_stop(struct seq_file *m, void *p)
- __releases(&vmap_area_lock)
+static void show_purge_info(struct seq_file *m)
{
- spin_unlock(&vmap_area_lock);
+ struct vmap_node *vn;
+ struct vmap_area *va;
+
+ for_each_vmap_node(vn) {
+ spin_lock(&vn->lazy.lock);
+ list_for_each_entry(va, &vn->lazy.head, list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va_size(va));
+ }
+ spin_unlock(&vn->lazy.lock);
+ }
}
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+static int vmalloc_info_show(struct seq_file *m, void *p)
{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
+ struct vmap_node *vn;
+ struct vmap_area *va;
+ struct vm_struct *v;
+ unsigned int *counters;
- if (!counters)
- return;
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ for_each_vmap_node(vn) {
+ spin_lock(&vn->busy.lock);
+ list_for_each_entry(va, &vn->busy.head, list) {
+ if (!va->vm) {
+ if (va->flags & VMAP_RAM)
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va_size(va));
- for (nr = 0; nr < v->nr_pages; nr++)
- counters[page_to_nid(v->pages[nr])]++;
+ continue;
+ }
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
-}
+ v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
-static int s_show(struct seq_file *m, void *p)
-{
- struct vmap_area *va = p;
- struct vm_struct *v;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
- if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
- return 0;
+ seq_printf(m, "0x%pK-0x%pK %7ld",
+ v->addr, v->addr + v->size, v->size);
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
- return 0;
- }
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
- v = va->vm;
+ if (v->nr_pages)
+ seq_printf(m, " pages=%d", v->nr_pages);
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
- if (v->flags & VM_UNINITIALIZED)
- return 0;
+ if (v->phys_addr)
+ seq_printf(m, " phys=%pa", &v->phys_addr);
- seq_printf(m, "0x%pK-0x%pK %7ld",
- v->addr, v->addr + v->size, v->size);
+ if (v->flags & VM_IOREMAP)
+ seq_puts(m, " ioremap");
- if (v->caller)
- seq_printf(m, " %pS", v->caller);
+ if (v->flags & VM_SPARSE)
+ seq_puts(m, " sparse");
- if (v->nr_pages)
- seq_printf(m, " pages=%d", v->nr_pages);
+ if (v->flags & VM_ALLOC)
+ seq_puts(m, " vmalloc");
- if (v->phys_addr)
- seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
+ if (v->flags & VM_MAP)
+ seq_puts(m, " vmap");
- if (v->flags & VM_IOREMAP)
- seq_printf(m, " ioremap");
+ if (v->flags & VM_USERMAP)
+ seq_puts(m, " user");
- if (v->flags & VM_ALLOC)
- seq_printf(m, " vmalloc");
+ if (v->flags & VM_DMA_COHERENT)
+ seq_puts(m, " dma-coherent");
- if (v->flags & VM_MAP)
- seq_printf(m, " vmap");
+ if (is_vmalloc_addr(v->pages))
+ seq_puts(m, " vpages");
- if (v->flags & VM_USERMAP)
- seq_printf(m, " user");
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
- if (v->flags & VM_VPAGES)
- seq_printf(m, " vpages");
+ seq_putc(m, '\n');
+ }
+ spin_unlock(&vn->busy.lock);
+ }
- show_numa_info(m, v);
- seq_putc(m, '\n');
+ /*
+ * As a final step, dump "unpurged" areas.
+ */
+ show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
-static const struct seq_operations vmalloc_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
+static int __init proc_vmalloc_init(void)
+{
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
+ return 0;
+}
+module_init(proc_vmalloc_init);
+
+#endif
-static int vmalloc_open(struct inode *inode, struct file *file)
+static void __init vmap_init_free_space(void)
{
- unsigned int *ptr = NULL;
- int ret;
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *free;
+ struct vm_struct *busy;
- if (IS_ENABLED(CONFIG_NUMA)) {
- ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
- if (ptr == NULL)
- return -ENOMEM;
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ for (busy = vmlist; busy; busy = busy->next) {
+ if ((unsigned long) busy->addr - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = (unsigned long) busy->addr;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = (unsigned long) busy->addr + busy->size;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
}
- ret = seq_open(file, &vmalloc_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = ptr;
- } else
- kfree(ptr);
- return ret;
}
-static const struct file_operations proc_vmalloc_operations = {
- .open = vmalloc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
+static void vmap_init_nodes(void)
+{
+ struct vmap_node *vn;
+ int i;
-static int __init proc_vmalloc_init(void)
+#if BITS_PER_LONG == 64
+ /*
+ * A high threshold of max nodes is fixed and bound to 128,
+ * thus a scale factor is 1 for systems where number of cores
+ * are less or equal to specified threshold.
+ *
+ * As for NUMA-aware notes. For bigger systems, for example
+ * NUMA with multi-sockets, where we can end-up with thousands
+ * of cores in total, a "sub-numa-clustering" should be added.
+ *
+ * In this case a NUMA domain is considered as a single entity
+ * with dedicated sub-nodes in it which describe one group or
+ * set of cores. Therefore a per-domain purging is supposed to
+ * be added as well as a per-domain balancing.
+ */
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+
+ if (n > 1) {
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
+ if (vn) {
+ /* Node partition is 16 pages. */
+ vmap_zone_size = (1 << 4) * PAGE_SIZE;
+ nr_vmap_nodes = n;
+ vmap_nodes = vn;
+ } else {
+ pr_err("Failed to allocate an array. Disable a node layer\n");
+ }
+ }
+#endif
+
+ for_each_vmap_node(vn) {
+ vn->busy.root = RB_ROOT;
+ INIT_LIST_HEAD(&vn->busy.head);
+ spin_lock_init(&vn->busy.lock);
+
+ vn->lazy.root = RB_ROOT;
+ INIT_LIST_HEAD(&vn->lazy.head);
+ spin_lock_init(&vn->lazy.lock);
+
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+ INIT_LIST_HEAD(&vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, 0);
+ }
+
+ spin_lock_init(&vn->pool_lock);
+ }
+}
+
+static unsigned long
+vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
- return 0;
+ unsigned long count = 0;
+ struct vmap_node *vn;
+ int i;
+
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
+ }
+
+ return count ? count : SHRINK_EMPTY;
}
-module_init(proc_vmalloc_init);
-void get_vmalloc_info(struct vmalloc_info *vmi)
+static unsigned long
+vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- struct vmap_area *va;
- unsigned long free_area_size;
- unsigned long prev_end;
+ struct vmap_node *vn;
+
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
- vmi->used = 0;
- vmi->largest_chunk = 0;
+ return SHRINK_STOP;
+}
+
+void __init vmalloc_init(void)
+{
+ struct shrinker *vmap_node_shrinker;
+ struct vmap_area *va;
+ struct vmap_node *vn;
+ struct vm_struct *tmp;
+ int i;
- prev_end = VMALLOC_START;
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
- spin_lock(&vmap_area_lock);
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+ struct vfree_deferred *p;
- if (list_empty(&vmap_area_list)) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- goto out;
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ p = &per_cpu(vfree_deferred, i);
+ init_llist_head(&p->list);
+ INIT_WORK(&p->wq, delayed_vfree_work);
+ xa_init(&vbq->vmap_blocks);
}
- list_for_each_entry(va, &vmap_area_list, list) {
- unsigned long addr = va->va_start;
+ /*
+ * Setup nodes before importing vmlist.
+ */
+ vmap_init_nodes();
- /*
- * Some archs keep another range for modules in vmalloc space
- */
- if (addr < VMALLOC_START)
+ /* Import existing vmlist entries. */
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
continue;
- if (addr >= VMALLOC_END)
- break;
- if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
- continue;
+ va->va_start = (unsigned long)tmp->addr;
+ va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
- vmi->used += (va->va_end - va->va_start);
+ vn = addr_to_node(va->va_start);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ }
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
+ /*
+ * Now we can initialize a free vmap space.
+ */
+ vmap_init_free_space();
+ vmap_initialized = true;
- prev_end = va->va_end;
+ vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
+ if (!vmap_node_shrinker) {
+ pr_err("Failed to allocate vmap-node shrinker!\n");
+ return;
}
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
-out:
- spin_unlock(&vmap_area_lock);
+ vmap_node_shrinker->count_objects = vmap_node_shrink_count;
+ vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
+ shrinker_register(vmap_node_shrinker);
}
-#endif
-