summaryrefslogtreecommitdiff
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c840
1 files changed, 580 insertions, 260 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e34ea860153f..ecbac900c35f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -100,12 +100,18 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct page *page;
unsigned long size = PAGE_SIZE;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
+ return -EINVAL;
+
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
- if (!pte_none(ptep_get(pte))) {
+ if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
dump_page(page, "remapping already mapped page");
@@ -127,6 +133,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -162,6 +170,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
{
pmd_t *pmd;
unsigned long next;
+ int err = 0;
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
@@ -175,10 +184,11 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
@@ -212,6 +222,7 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
{
pud_t *pud;
unsigned long next;
+ int err = 0;
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
@@ -225,11 +236,11 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
- max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
@@ -263,6 +274,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
+ int err = 0;
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
@@ -276,11 +288,11 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
- max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
static int vmap_range_noflush(unsigned long addr, unsigned long end,
@@ -350,12 +362,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -374,8 +404,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +431,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -487,6 +521,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -497,21 +532,33 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
@@ -586,13 +633,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
@@ -631,16 +678,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
}
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
- pgprot_t prot, struct page **pages, unsigned int page_shift)
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
{
int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
- page_shift);
+ page_shift, gfp_mask);
if (ret)
return ret;
return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}
+static int __vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask);
+ flush_cache_vmap(addr, end);
+ return err;
+}
+
/**
* vmap_pages_range - map pages to a kernel virtual address
* @addr: start of the VM area to map
@@ -653,14 +712,10 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
* RETURNS:
* 0 on success, -errno on failure.
*/
-static int vmap_pages_range(unsigned long addr, unsigned long end,
+int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int err;
-
- err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
- flush_cache_vmap(addr, end);
- return err;
+ return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
}
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
@@ -900,6 +955,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -918,6 +978,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -990,7 +1063,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1056,12 +1130,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1698,7 +1771,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1712,7 +1785,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1741,19 +1814,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1778,14 +1851,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1816,7 +1888,7 @@ static void free_vmap_area(struct vmap_area *va)
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
- struct vmap_area *va = NULL;
+ struct vmap_area *va = NULL, *tmp;
/*
* Preload this CPU with one extra vmap_area object. It is used
@@ -1832,7 +1904,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
spin_lock(lock);
- if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ tmp = NULL;
+ if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
kmem_cache_free(vmap_area_cachep, va);
}
@@ -1914,7 +1987,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -1939,7 +2012,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -1959,6 +2032,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long freed;
unsigned long addr;
unsigned int vn_id;
+ bool allow_block;
int purged = 0;
int ret;
@@ -1968,7 +2042,10 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
- might_sleep();
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
+ allow_block = gfpflags_allow_blocking(gfp_mask);
+ might_sleep_if(allow_block);
/*
* If a VA is obtained from a global heap(if it fails here)
@@ -1980,8 +2057,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1994,21 +2069,36 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+
+ /*
+ * This is not a fast path. Check if yielding is needed. This
+ * is the only reschedule point in the vmalloc() path.
+ */
+ if (allow_block)
+ cond_resched();
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
- goto overflow;
+ if (IS_ERR_VALUE(addr)) {
+ if (allow_block)
+ goto overflow;
+
+ /*
+ * We can not trigger any reclaim logic because
+ * sleeping is not allowed, thus fail an allocation.
+ */
+ goto out_free_va;
+ }
va->va_start = addr;
va->va_end = addr + size;
@@ -2017,7 +2107,7 @@ retry:
if (vm) {
vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
+ vm->size = va_size(va);
va->vm = vm;
}
@@ -2031,7 +2121,7 @@ retry:
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
- ret = kasan_populate_vmalloc(addr, size);
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
@@ -2055,9 +2145,10 @@ overflow:
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
- pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
- size);
+ pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
+ size, vstart, vend);
+out_free_va:
kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
@@ -2099,8 +2190,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2110,7 +2199,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2130,44 +2218,37 @@ reclaim_list_global(struct list_head *head)
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
+ LIST_HEAD(decay_list);
+ struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- struct list_head decay_list;
- struct rb_root decay_root;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
- decay_root = RB_ROOT;
- INIT_LIST_HEAD(&decay_list);
-
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
- struct list_head tmp_list;
+ LIST_HEAD(tmp_list);
if (list_empty(&vn->pool[i].head))
continue;
- INIT_LIST_HEAD(&tmp_list);
-
/* Detach the pool, so no-one can access it. */
spin_lock(&vn->pool_lock);
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% out of left objects. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2176,9 +2257,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
@@ -2186,28 +2268,45 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
reclaim_list_global(&decay_list);
}
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+ struct vmap_area *va;
+ unsigned long start, end;
+
+ start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+ end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+ list_for_each_entry(va, &vn->purge_list, list) {
+ if (is_vmalloc_or_module_addr((void *) va->va_start))
+ kasan_release_vmalloc(va->va_start, va->va_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE);
+ }
+
+ kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
static void purge_vmap_node(struct work_struct *work)
{
struct vmap_node *vn = container_of(work,
struct vmap_node, purge_work);
+ unsigned long nr_purged_pages = 0;
struct vmap_area *va, *n_va;
LIST_HEAD(local_list);
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_release_vmalloc_node(vn);
+
vn->nr_purged = 0;
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
- unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- unsigned long orig_start = va->va_start;
- unsigned long orig_end = va->va_end;
+ unsigned long nr = va_size(va) >> PAGE_SHIFT;
unsigned int vn_id = decode_vn_id(va->flags);
list_del_init(&va->list);
- if (is_vmalloc_or_module_addr((void *)orig_start))
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
-
- atomic_long_sub(nr, &vmap_lazy_nr);
+ nr_purged_pages += nr;
vn->nr_purged++;
if (is_vn_id_valid(vn_id) && !vn->skip_populate)
@@ -2218,6 +2317,8 @@ static void purge_vmap_node(struct work_struct *work)
list_add(&va->list, &local_list);
}
+ atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);
+
reclaim_list_global(&local_list);
}
@@ -2229,6 +2330,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2240,9 +2342,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2261,7 +2361,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2340,8 +2440,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
- PAGE_SHIFT, &vmap_lazy_nr);
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
/*
* If it was request by a certain node we would like to
@@ -2406,7 +2506,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2432,7 +2532,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2599,8 +2699,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
node = numa_node_id();
- vb = kmalloc_node(sizeof(struct vmap_block),
- gfp_mask & GFP_RECLAIM_MASK, node);
+ vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node);
if (unlikely(!vb))
return ERR_PTR(-ENOMEM);
@@ -2625,6 +2724,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
vb->dirty_max = 0;
bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
+ vb->cpu = raw_smp_processor_id();
xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -2641,7 +2741,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
* integrity together with list_for_each_rcu from read
* side.
*/
- vb->cpu = raw_smp_processor_id();
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
@@ -2901,10 +3000,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -2937,8 +3033,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
if (WARN_ON_ONCE(!va))
return;
- debug_check_no_locks_freed((void *)va->va_start,
- (va->va_end - va->va_start));
+ debug_check_no_locks_freed((void *)va->va_start, va_size(va));
free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -3009,6 +3104,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
#endif
}
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+ return vm_area_page_order(vm);
+}
+
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
@@ -3081,13 +3181,13 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
}
-static struct vm_struct *__get_vm_area_node(unsigned long size,
+struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long flags,
unsigned long start, unsigned long end, int node,
gfp_t gfp_mask, const void *caller)
@@ -3114,6 +3214,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -3351,11 +3452,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3363,7 +3466,8 @@ void vfree(const void *addr)
__free_page(page);
cond_resched();
}
- atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -3509,15 +3613,58 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
+/*
+ * Helper for vmalloc to adjust the gfp flags for certain allocations.
+ */
+static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large)
+{
+ flags |= __GFP_NOWARN;
+ if (large)
+ flags &= ~__GFP_NOFAIL;
+ return flags;
+}
+
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
- gfp_t alloc_gfp = gfp;
- bool nofail = gfp & __GFP_NOFAIL;
+ unsigned int nr_remaining = nr_pages;
+ unsigned int max_attempt_order = MAX_PAGE_ORDER;
struct page *page;
int i;
+ unsigned int large_order = ilog2(nr_remaining);
+ gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM;
+
+ large_order = min(max_attempt_order, large_order);
+
+ /*
+ * Initially, attempt to have the page allocator give us large order
+ * pages. Do not attempt allocating smaller than order chunks since
+ * __vmap_pages_range() expects physically contigous pages of exactly
+ * order long chunks.
+ */
+ while (large_order > order && nr_remaining) {
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages_noprof(large_gfp, large_order);
+ else
+ page = alloc_pages_node_noprof(nid, large_gfp, large_order);
+
+ if (unlikely(!page)) {
+ max_attempt_order = --large_order;
+ continue;
+ }
+
+ split_page(page, large_order);
+ for (i = 0; i < (1U << large_order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ nr_allocated += 1U << large_order;
+ nr_remaining = nr_pages - nr_allocated;
+
+ large_order = ilog2(nr_remaining);
+ large_order = min(max_attempt_order, large_order);
+ }
/*
* For order-0 pages we make use of bulk allocator, if
@@ -3526,9 +3673,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* more permissive.
*/
if (!order) {
- /* bulk allocator doesn't support nofail req. officially */
- gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
-
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
@@ -3546,17 +3690,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
+ nr = alloc_pages_bulk_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
-
else
- nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
+ nr = alloc_pages_bulk_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
nr_allocated += nr;
- cond_resched();
/*
* If zero or pages were obtained partly,
@@ -3565,37 +3707,24 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
- } else if (gfp & __GFP_NOFAIL) {
- /*
- * Higher order nofail allocations are really expensive and
- * potentially dangerous (pre-mature OOM, disruptive reclaim
- * and compaction etc.
- */
- alloc_gfp &= ~__GFP_NOFAIL;
}
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
- if (!nofail && fatal_signal_pending(current))
+ if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
- page = alloc_pages_noprof(alloc_gfp, order);
+ page = alloc_pages_noprof(gfp, order);
else
- page = alloc_pages_node_noprof(nid, alloc_gfp, order);
- if (unlikely(!page)) {
- if (!nofail)
- break;
+ page = alloc_pages_node_noprof(nid, gfp, order);
- /* fall back to the zero order allocations */
- alloc_gfp |= __GFP_NOFAIL;
- order = 0;
- continue;
- }
+ if (unlikely(!page))
+ break;
/*
- * Higher order allocations must be able to be treated as
- * indepdenent small pages by callers (as they can with
+ * High-order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
* small-page vmallocs). Some drivers do their own refcounting
* on vmalloc_to_page() pages, some use page->mapping,
* page->lru, etc.
@@ -3611,13 +3740,77 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
- cond_resched();
nr_allocated += 1U << order;
}
return nr_allocated;
}
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+ struct vm_struct *area, *tmp;
+ struct llist_node *head;
+
+ head = llist_del_all(&pending_vm_area_cleanup);
+ if (!head)
+ return;
+
+ llist_for_each_entry_safe(area, tmp, head, llnode) {
+ if (!area->pages)
+ free_vm_area(area);
+ else
+ vfree(area->addr);
+ }
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+ if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+ schedule_work(&cleanup_vm_area);
+}
+
+/*
+ * Page tables allocations ignore external GFP. Enforces it by
+ * the memalloc scope API. It is used by vmalloc internals and
+ * KASAN shadow population only.
+ *
+ * GFP to scope mapping:
+ *
+ * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
+ * GFP_NOFS - memalloc_nofs_save()
+ * GFP_NOIO - memalloc_noio_save()
+ *
+ * Returns a flag cookie to pair with restore.
+ */
+unsigned int
+memalloc_apply_gfp_scope(gfp_t gfp_mask)
+{
+ unsigned int flags = 0;
+
+ if (!gfpflags_allow_blocking(gfp_mask))
+ flags = memalloc_noreclaim_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ /* 0 - no scope applied. */
+ return flags;
+}
+
+void
+memalloc_restore_scope(unsigned int flags)
+{
+ if (flags)
+ memalloc_flags_restore(flags);
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
@@ -3634,6 +3827,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
+ /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */
+ if (!gfpflags_allow_blocking(gfp_mask))
+ nofail = false;
+
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -3649,23 +3846,29 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
- free_vm_area(area);
- return NULL;
+ goto fail;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
- area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
- node, page_order, nr_small_pages, area->pages);
+ /*
+ * High-order nofail allocations are really expensive and
+ * potentially dangerous (pre-mature OOM, disruptive reclaim
+ * and compaction etc.
+ *
+ * Please note, the __vmalloc_node_range_noprof() falls-back
+ * to order-0 pages if high-order attempt is unsuccessful.
+ */
+ area->nr_pages = vm_area_alloc_pages(
+ vmalloc_gfp_adjust(gfp_mask, page_order), node,
+ page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -3693,22 +3896,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* page tables allocations ignore external gfp mask, enforce it
* by the scope API
*/
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- flags = memalloc_nofs_save();
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- flags = memalloc_noio_save();
-
+ flags = memalloc_apply_gfp_scope(gfp_mask);
do {
- ret = vmap_pages_range(addr, addr + size, prot, area->pages,
- page_shift);
+ ret = __vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift, nested_gfp);
if (nofail && (ret < 0))
schedule_timeout_uninterruptible(1);
} while (nofail && (ret < 0));
-
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- memalloc_nofs_restore(flags);
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- memalloc_noio_restore(flags);
+ memalloc_restore_scope(flags);
if (ret < 0) {
warn_alloc(gfp_mask, NULL,
@@ -3720,10 +3915,32 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- vfree(area->addr);
+ defer_vm_area_cleanup(area);
return NULL;
}
+/*
+ * See __vmalloc_node_range() for a clear list of supported vmalloc flags.
+ * This gfp lists all flags currently passed through vmalloc. Currently,
+ * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm
+ * and BPF also use GFP_USER. Additionally, various users pass
+ * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP.
+ */
+#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
+ __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
+ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
+ GFP_USER | __GFP_NOLOCKDEP)
+
+static gfp_t vmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED;
+
+ flags &= GFP_VMALLOC_SUPPORTED;
+ WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ return flags;
+}
+
/**
* __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
@@ -3737,19 +3954,20 @@ fail:
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Please note that the full set of gfp
- * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
- * supported.
- * Zone modifiers are not supported. From the reclaim modifiers
- * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
- * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
- * __GFP_RETRY_MAYFAIL are not supported).
+ * allocator with @gfp_mask flags and map them into contiguous
+ * virtual range with protection @prot.
+ *
+ * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT,
+ * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported.
+ * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only
+ * by __vmalloc().
*
- * __GFP_NOWARN can be used to suppress failures messages.
+ * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY
+ * and %__GFP_RETRY_MAYFAIL are not supported.
*
- * Map them into contiguous kernel virtual space, using a pagetable
- * protection of @prot.
+ * %__GFP_NOWARN can be used to suppress failure messages.
*
+ * Can not be called from interrupt nor NMI contexts.
* Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
@@ -3760,8 +3978,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
- unsigned long real_size = size;
- unsigned long real_align = align;
+ unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
@@ -3770,13 +3987,11 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
- real_size);
+ size);
return NULL;
}
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
- unsigned long size_per_node;
-
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
@@ -3784,27 +3999,23 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
* supporting them.
*/
- size_per_node = size;
- if (node == NUMA_NO_NODE)
- size_per_node /= num_online_nodes();
- if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
shift = PMD_SHIFT;
else
- shift = arch_vmap_pte_supported_shift(size_per_node);
+ shift = arch_vmap_pte_supported_shift(size);
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
+ align = max(original_align, 1UL << shift);
}
again:
- area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
- real_size, (nofail) ? ". Retrying." : "");
+ size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
@@ -3854,7 +4065,7 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -3863,17 +4074,15 @@ again:
*/
clear_vm_uninitialized_flag(area);
- size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
- kmemleak_vmalloc(area, size, gfp_mask);
+ kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
return area->addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
- align = real_align;
- size = real_size;
+ align = original_align;
goto again;
}
@@ -3891,11 +4100,8 @@ fail:
* Allocate enough pages to cover @size from the page level allocator with
* @gfp_mask flags. Map them into contiguous kernel virtual space.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
- * and __GFP_NOFAIL are not supported
- *
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
- * with mm people.
+ * Semantics of @gfp_mask (including reclaim/retry modifiers such as
+ * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof().
*
* Return: pointer to the allocated memory or %NULL on error
*/
@@ -3916,6 +4122,8 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
@@ -3941,9 +4149,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3952,13 +4161,15 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4036,6 +4247,115 @@ void *vzalloc_node_noprof(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node_noprof);
+/**
+ * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @align: requested alignment
+ * @flags: the flags for the page level allocator
+ * @nid: node number of the target node
+ *
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If the caller wants the new memory to be on specific node *only*,
+ * __GFP_THISNODE flag should be set, otherwise the function will try to avoid
+ * reallocation and possibly disregard the specified @nid.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or vfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory; %NULL if @size is zero or in case of
+ * failure
+ */
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
+{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
+ size_t old_size = 0;
+ void *n;
+
+ if (!size) {
+ vfree(p);
+ return NULL;
+ }
+
+ if (p) {
+ vm = find_vm_area(p);
+ if (unlikely(!vm)) {
+ WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
+ return NULL;
+ }
+
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
+ }
+
+ /*
+ * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
+ * would be a good heuristic for when to shrink the vm_area?
+ */
+ if (size <= old_size) {
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
+ memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
+ kasan_poison_vmalloc(p + size, old_size - size);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
+ return (void *)p;
+ }
+
+need_realloc:
+ /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
+ if (!n)
+ return NULL;
+
+ if (p) {
+ memcpy(n, p, old_size);
+ vfree(p);
+ }
+
+ return n;
+}
+
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
@@ -4684,7 +5004,7 @@ retry:
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
@@ -4726,7 +5046,8 @@ recovery:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
}
@@ -4776,7 +5097,8 @@ err_free_shadow:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
kfree(vms[area]);
}
@@ -4839,44 +5161,42 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and VM_UNINITIALIZED bit in v->flags is disabled.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ va_size(va));
}
spin_unlock(&vn->lazy.lock);
}
@@ -4887,23 +5207,29 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
if (va->flags & VMAP_RAM)
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ va_size(va));
continue;
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -4938,7 +5264,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -4948,19 +5276,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5012,7 +5335,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5029,10 +5352,10 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
@@ -5044,8 +5367,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5066,15 +5388,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5083,10 +5403,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}