From a7c3e901a46ff54c016d040847eda598a9e3e653 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:09 -0700 Subject: mm: introduce kv[mz]alloc helpers Patch series "kvmalloc", v5. There are many open coded kmalloc with vmalloc fallback instances in the tree. Most of them are not careful enough or simply do not care about the underlying semantic of the kmalloc/page allocator which means that a) some vmalloc fallbacks are basically unreachable because the kmalloc part will keep retrying until it succeeds b) the page allocator can invoke a really disruptive steps like the OOM killer to move forward which doesn't sound appropriate when we consider that the vmalloc fallback is available. As it can be seen implementing kvmalloc requires quite an intimate knowledge if the page allocator and the memory reclaim internals which strongly suggests that a helper should be implemented in the memory subsystem proper. Most callers, I could find, have been converted to use the helper instead. This is patch 6. There are some more relying on __GFP_REPEAT in the networking stack which I have converted as well and Eric Dumazet was not opposed [2] to convert them as well. [1] http://lkml.kernel.org/r/20170130094940.13546-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/1485273626.16328.301.camel@edumazet-glaptop3.roam.corp.google.com This patch (of 9): Using kmalloc with the vmalloc fallback for larger allocations is a common pattern in the kernel code. Yet we do not have any common helper for that and so users have invented their own helpers. Some of them are really creative when doing so. Let's just add kv[mz]alloc and make sure it is implemented properly. This implementation makes sure to not make a large memory pressure for > PAGE_SZE requests (__GFP_NORETRY) and also to not warn about allocation failures. This also rules out the OOM killer as the vmalloc is a more approapriate fallback than a disruptive user visible action. This patch also changes some existing users and removes helpers which are specific for them. In some cases this is not possible (e.g. ext4_kvmalloc, libcfs_kvzalloc) because those seems to be broken and require GFP_NO{FS,IO} context which is not vmalloc compatible in general (note that the page table allocation is GFP_KERNEL). Those need to be fixed separately. While we are at it, document that __vmalloc{_node} about unsupported gfp mask because there seems to be a lot of confusion out there. kvmalloc_node will warn about GFP_KERNEL incompatible (which are not superset) flags to catch new abusers. Existing ones would have to die slowly. [sfr@canb.auug.org.au: f2fs fixup] Link: http://lkml.kernel.org/r/20170320163735.332e64b7@canb.auug.org.au Link: http://lkml.kernel.org/r/20170306103032.2540-2-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Reviewed-by: Andreas Dilger [ext4 part] Acked-by: Vlastimil Babka Cc: John Hubbard Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b52aeed3f58e..33603239560e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1786,6 +1786,13 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. + * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1802,7 +1809,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { return __vmalloc_node(size, 1, flags, PAGE_KERNEL, -- cgit From 1f5307b1e094bfffa83c65c40ac6e3415c108780 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:12 -0700 Subject: mm, vmalloc: properly track vmalloc users __vmalloc_node_flags used to be static inline but this has changed by "mm: introduce kv[mz]alloc helpers" because kvmalloc_node needs to use it as well and the code is outside of the vmalloc proper. I haven't realized that changing this will lead to a subtle bug though. The function is responsible to track the caller as well. This caller is then printed by /proc/vmallocinfo. If __vmalloc_node_flags is not inline then we would get only direct users of __vmalloc_node_flags as callers (e.g. v[mz]alloc) which reduces usefulness of this debugging feature considerably. It simply doesn't help to see that the given range belongs to vmalloc as a caller: 0xffffc90002c79000-0xffffc90002c7d000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N0=3 0xffffc90002c81000-0xffffc90002c85000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 0xffffc90002c8d000-0xffffc90002c91000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 0xffffc90002c95000-0xffffc90002c99000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 We really want to catch the _caller_ of the vmalloc function. Fix this issue by making __vmalloc_node_flags static inline again. Link: http://lkml.kernel.org/r/20170502134657.12381-1-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 33603239560e..717b1e8b942c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1649,9 +1649,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -1794,7 +1791,7 @@ fail: * with mm people. * */ -static void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller) { @@ -1809,13 +1806,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size -- cgit From 19809c2da28aee5860ad9a2eff760730a0710df0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:44 -0700 Subject: mm, vmalloc: use __GFP_HIGHMEM implicitly __vmalloc* allows users to provide gfp flags for the underlying allocation. This API is quite popular $ git grep "=[[:space:]]__vmalloc\|return[[:space:]]*__vmalloc" | wc -l 77 The only problem is that many people are not aware that they really want to give __GFP_HIGHMEM along with other flags because there is really no reason to consume precious lowmemory on CONFIG_HIGHMEM systems for pages which are mapped to the kernel vmalloc space. About half of users don't use this flag, though. This signals that we make the API unnecessarily too complex. This patch simply uses __GFP_HIGHMEM implicitly when allocating pages to be mapped to the vmalloc space. Current users which add __GFP_HIGHMEM are simplified and drop the flag. Link: http://lkml.kernel.org/r/20170307141020.29107-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Matthew Wilcox Cc: Al Viro Cc: Vlastimil Babka Cc: David Rientjes Cc: Cristopher Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 717b1e8b942c..1dda6d8a200a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1655,7 +1655,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__vmalloc); void *vmalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM); + GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -1835,7 +1835,7 @@ EXPORT_SYMBOL(vmalloc); void *vzalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1852,7 +1852,7 @@ void *vmalloc_user(unsigned long size) void *ret; ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { @@ -1876,7 +1876,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1896,7 +1896,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc_node); @@ -1918,7 +1918,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, __builtin_return_address(0)); } -- cgit