From 0db10c8e8ff24559b3768b8d010191b01e0940ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 11 Oct 2012 21:05:10 +0200 Subject: krealloc: Fix kernel-doc comment It should say "@new_size" and not "@size". Correct that. Signed-off-by: Borislav Petkov Cc: trivial@kernel.org Signed-off-by: Jiri Kosina --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 8c7265afa29f..3a5278c08d76 100644 --- a/mm/util.c +++ b/mm/util.c @@ -145,7 +145,7 @@ EXPORT_SYMBOL(__krealloc); * * The contents of the object pointed to are preserved up to the * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. */ void *krealloc(const void *p, size_t new_size, gfp_t flags) -- cgit From 837d678dc264c797c16f81cf56f615f7544891c1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 16 Aug 2012 00:02:40 +0900 Subject: slub: remove one code path and reduce lock contention in __slab_free() When we try to free an object, there are cases in which we need to take the node lock. This is a necessary step for preventing a race. After taking the lock we try cmpxchg_double_slab(), but there is a possible scenario in which cmpxchg_double_slab() fails even though we took the lock. The following example explains it:
CPU A             CPU B
need lock
...               need lock
...               lock!!
lock..but spin    free success
spin...           unlock
lock!!
free fail
In this case CPU A retries while still holding the lock. For CPU A, "release the lock first, and re-take it if necessary" is the preferable policy, for two reasons. First, it makes __slab_free()'s logic somewhat simpler: with this patch, 'was_frozen = 1' is "always" handled without taking a lock, so we can remove one code path. Second, it may reduce lock contention: by the time we retry, the status of the slab has already changed, so in almost every case we no longer need the lock, and the "release the lock first, and re-take it if necessary" policy takes advantage of that.
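The retry policy described above can be illustrated outside the kernel. The following is a minimal, hypothetical userspace analogue in C11 (all names here are invented; none of this is kernel code): whatever lock was taken on a previous, failed pass is dropped at the top of the next retry, so the compare-and-swap loop never spins while holding it, and the lock is taken only when the freshly recomputed state says it is needed.

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic int slab_counters;		/* stand-in for page->counters */

static void slow_free(void)
{
	bool locked = false;
	int prior, new;

	do {
		if (locked) {
			/* lock taken on an earlier, failed pass: release it first */
			pthread_mutex_unlock(&node_lock);
			locked = false;
		}
		prior = atomic_load(&slab_counters);
		new = prior - 1;
		if (new == 0) {
			/* the "slab" may become empty: only now take the list lock */
			pthread_mutex_lock(&node_lock);
			locked = true;
		}
		/* retry if someone else changed the counters in the meantime */
	} while (!atomic_compare_exchange_weak(&slab_counters, &prior, new));

	if (locked)
		pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	atomic_store(&slab_counters, 2);
	slow_free();
	return 0;
}

The hunk below applies the same idea to __slab_free() itself: the spin_unlock_irqrestore() that the patch adds at the top of the do/while loop plays the role of the unlock-before-retry above.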
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a0d698467f70..e7aec2001ae5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2459,7 +2459,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void *prior; void **object = (void *)x; int was_frozen; - int inuse; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -2472,13 +2471,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } prior = page->freelist; counters = page->counters; set_freepointer(s, object, prior); new.counters = counters; was_frozen = new.frozen; new.inuse--; - if ((!new.inuse || !prior) && !was_frozen && !n) { + if ((!new.inuse || !prior) && !was_frozen) { if (!kmem_cache_debug(s) && !prior) @@ -2503,7 +2506,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - inuse = new.inuse; } while (!cmpxchg_double_slab(s, page, prior, counters, @@ -2529,25 +2531,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + goto slab_empty; + /* - * was_frozen may have been set after we acquired the list_lock in - * an earlier loop. So we need to check it here again. + * Objects left in the slab. If it was not on the partial list before + * then add it. */ - if (was_frozen) - stat(s, FREE_FROZEN); - else { - if (unlikely(!inuse && n->nr_partial > s->min_partial)) - goto slab_empty; - - /* - * Objects left in the slab. If it was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) { - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } + if (kmem_cache_debug(s) && unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; -- cgit From f0263d2d222e9e25f2587e51a9dc58c6fb2a9352 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 19 Oct 2012 14:03:31 +0100 Subject: mm: highmem: export kmap_to_page for modules Some virtio device drivers (9p) need to translate high virtual addresses to physical addresses, which are inserted into the virtqueue for processing by userspace. This patch exports the kmap_to_page symbol, so that the affected drivers can be compiled as modules. Cc: stable@kernel.org Signed-off-by: Will Deacon Signed-off-by: Rusty Russell --- mm/highmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index d517cd16a6eb..2a07f97dabf1 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -105,6 +105,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } +EXPORT_SYMBOL(kmap_to_page); static void flush_all_zero_pkmaps(void) { -- cgit From b7454ad3cfc3043c5264729a6204f049fe1f34b1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 19 Oct 2012 18:20:25 +0400 Subject: mm/sl[au]b: Move slabinfo processing to slab_common.c This patch moves all the common machinery to slabinfo processing to slab_common.c. We can do better by noticing that the output is heavily common, and having the allocators to just provide finished information about this. But after this first step, this can be done easier. 
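The "finished information" idea mentioned above is roughly where the later patches in this series end up. The sketch below is only for orientation and borrows the names those later patches introduce (struct slabinfo, get_slabinfo()); it is not part of this particular patch:

/* mm/slab.h - contract shared by the allocators */
struct slabinfo {
	unsigned long active_objs;
	unsigned long num_objs;
	unsigned long active_slabs;
	unsigned long num_slabs;
	/* ... tunables, objects per slab, page order ... */
};
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);

/* mm/slab_common.c - one /proc/slabinfo show routine for all allocators */
static int s_show(struct seq_file *m, void *p)
{
	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
	struct slabinfo sinfo = { 0 };

	get_slabinfo(s, &sinfo);	/* allocator-specific counting only */
	seq_printf(m, "%-17s %6lu %6lu", s->name,
		   sinfo.active_objs, sinfo.num_objs);
	/* ... common formatting of tunables and slabdata ... */
	return 0;
}

This first patch only relocates the iterator and the proc glue; the per-allocator slabinfo_show() implementations stay in mm/slab.c and mm/slub.c for now.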
Signed-off-by: Glauber Costa Acked-by: Christoph Lameter CC: David Rientjes Signed-off-by: Pekka Enberg --- mm/slab.c | 72 ++++++++++---------------------------------------------- mm/slab.h | 8 +++++++ mm/slab_common.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 51 ++++----------------------------------- 4 files changed, 96 insertions(+), 105 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 33d3363658df..a6e045c13b8d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4277,7 +4277,7 @@ out: #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) +void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -4300,28 +4300,7 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) +int slabinfo_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; @@ -4418,27 +4397,6 @@ static int s_show(struct seq_file *m, void *p) return 0; } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -4447,7 +4405,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -static ssize_t slabinfo_write(struct file *file, const char __user *buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4490,19 +4448,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return res; } -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) @@ -4631,6 +4576,16 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + static const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, @@ -4665,7 +4620,6 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slab.h b/mm/slab.h index 
7deeb449a301..dc78101962a1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -47,4 +47,12 @@ static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t siz int __kmem_cache_shutdown(struct kmem_cache *); +struct seq_file; +struct file; +void print_slabinfo_header(struct seq_file *m); + +int slabinfo_show(struct seq_file *m, void *p); + +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 069a24e64403..2e4b4c6d89e2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -192,3 +194,71 @@ int slab_is_available(void) { return slab_state >= UP; } + +#ifdef CONFIG_SLABINFO +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&slab_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + +static int s_show(struct seq_file *m, void *p) +{ + return slabinfo_show(m, p); +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/mm/slub.c b/mm/slub.c index a0d698467f70..77a0c8a9fc75 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5405,7 +5405,7 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) +void print_slabinfo_header(struct seq_file *m) { seq_puts(m, "slabinfo - version: 2.1\n"); seq_puts(m, "# name " @@ -5415,28 +5415,7 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) +int slabinfo_show(struct seq_file *m, void *p) { unsigned long nr_partials = 0; unsigned long nr_slabs = 0; @@ -5472,29 +5451,9 @@ static int s_show(struct seq_file *m, void *p) return 0; } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct 
file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ -- cgit From bcee6e2a13d580f6c21d748fcd7239ccc66cb4b8 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 19 Oct 2012 18:20:26 +0400 Subject: mm/sl[au]b: Move print_slabinfo_header to slab_common.c The header format is highly similar between slab and slub. The main difference lays in the fact that slab may optionally have statistics added here in case of CONFIG_SLAB_DEBUG, while the slub will stick them somewhere else. By making sure that information conditionally lives inside a globally-visible CONFIG_DEBUG_SLAB switch, we can move the header printing to a common location. Signed-off-by: Glauber Costa Acked-by: Christoph Lameter CC: David Rientjes Signed-off-by: Pekka Enberg --- mm/slab.c | 24 ------------------------ mm/slab.h | 2 -- mm/slab_common.c | 23 +++++++++++++++++++++++ mm/slub.c | 10 ---------- 4 files changed, 23 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a6e045c13b8d..73811ca0ae29 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4276,30 +4276,6 @@ out: } #ifdef CONFIG_SLABINFO - -void print_slabinfo_header(struct seq_file *m) -{ - /* - * Output format version, so at least we can change it - * without _too_ many complaints. - */ -#if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else - seq_puts(m, "slabinfo - version: 2.1\n"); -#endif - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); -#if STATS - seq_puts(m, " : globalstat " - " "); - seq_puts(m, " : cpustat "); -#endif - seq_putc(m, '\n'); -} - int slabinfo_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); diff --git a/mm/slab.h b/mm/slab.h index dc78101962a1..3442eb83ee1e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -49,8 +49,6 @@ int __kmem_cache_shutdown(struct kmem_cache *); struct seq_file; struct file; -void print_slabinfo_header(struct seq_file *m); - int slabinfo_show(struct seq_file *m, void *p); ssize_t slabinfo_write(struct file *file, const char __user *buffer, diff --git a/mm/slab_common.c b/mm/slab_common.c index 2e4b4c6d89e2..c64a0438c1f3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -196,6 +196,29 @@ int slab_is_available(void) } #ifdef CONFIG_SLABINFO +static void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. 
+ */ +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name " + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); +#endif + seq_putc(m, '\n'); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; diff --git a/mm/slub.c b/mm/slub.c index 77a0c8a9fc75..6b5ee3472e18 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5405,16 +5405,6 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name " - " "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); - seq_putc(m, '\n'); -} - int slabinfo_show(struct seq_file *m, void *p) { unsigned long nr_partials = 0; -- cgit From 0d7561c61d76690ed84bd1016acc0fcbff063205 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 19 Oct 2012 18:20:27 +0400 Subject: sl[au]b: Process slabinfo_show in common code With all the infrastructure in place, we can now have slabinfo_show done from slab_common.c. A cache-specific function is called to grab information about the cache itself, since that is still heavily dependent on the implementation. But with the values produced by it, all the printing and handling is done from common code. Signed-off-by: Glauber Costa CC: Christoph Lameter CC: David Rientjes Signed-off-by: Pekka Enberg --- mm/slab.c | 26 +++++++++++++++----------- mm/slab.h | 16 +++++++++++++++- mm/slab_common.c | 18 +++++++++++++++++- mm/slub.c | 24 ++++++++++-------------- 4 files changed, 57 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 73811ca0ae29..6d5c83c6ddd5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4276,9 +4276,8 @@ out: } #ifdef CONFIG_SLABINFO -int slabinfo_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4333,13 +4332,20 @@ int slabinfo_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->size, - cachep->num, (1 << cachep->gfporder)); - seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, cachep->shared); - seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ #if STATS { /* list3 stats */ unsigned long high = cachep->high_mark; @@ -4369,8 +4375,6 @@ int slabinfo_show(struct seq_file *m, void *p) allochit, allocmiss, freehit, freemiss); } #endif - seq_putc(m, '\n'); - return 0; } #define MAX_SLABINFO_WRITE 128 diff --git a/mm/slab.h b/mm/slab.h index 3442eb83ee1e..5a43c2f13621 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -49,8 +49,22 @@ int __kmem_cache_shutdown(struct kmem_cache *); struct 
seq_file; struct file; -int slabinfo_show(struct seq_file *m, void *p); +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index c64a0438c1f3..5fb753da6cf0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -242,7 +242,23 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - return slabinfo_show(m, p); + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct slabinfo sinfo; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + s->name, sinfo.active_objs, sinfo.num_objs, s->size, + sinfo.objects_per_slab, (1 << sinfo.cache_order)); + + seq_printf(m, " : tunables %4u %4u %4u", + sinfo.limit, sinfo.batchcount, sinfo.shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); + slabinfo_show_stats(m, s); + seq_putc(m, '\n'); + return 0; } /* diff --git a/mm/slub.c b/mm/slub.c index 6b5ee3472e18..472e739278b4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5405,18 +5405,14 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -int slabinfo_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -5429,16 +5425,16 @@ int slabinfo_show(struct seq_file *m, void *p) nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); +} - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) +{ } ssize_t slabinfo_write(struct file *file, const char __user *buffer, -- cgit From 1b4f59e356cc94929305bd107b7f38eec62715ad Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 22 Oct 2012 18:05:36 +0400 Subject: slub: Commonize slab_cache field in struct page Right now, slab and slub have fields in struct page to derive which cache a page belongs to, but they do it slightly differently. slab uses a field called slab_cache, that lives in the third double word. slub, uses a field called "slab", living outside of the doublewords area. Ideally, we could use the same field for this. Since slub heavily makes use of the doubleword region, there isn't really much room to move slub's slab_cache field around. 
Since slab does not have such strict placement restrictions, we can move it outside the doubleword area. The naming used by slab, "slab_cache", is less confusing, and it is preferred over slub's generic "slab". Signed-off-by: Glauber Costa Acked-by: Christoph Lameter CC: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 16274b273c61..35483e0ab6bc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1092,11 +1092,11 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_object(s, page, object, SLUB_RED_ACTIVE)) goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { + } else if (!page->slab_cache) { printk(KERN_ERR "SLUB : no slab for object 0x%p.\n", object); @@ -1357,7 +1357,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) goto out; inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; + page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); @@ -1424,7 +1424,7 @@ static void rcu_free_slab(struct rcu_head *h) else page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) @@ -2617,9 +2617,9 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - if (kmem_cache_debug(s) && page->slab != s) { + if (kmem_cache_debug(s) && page->slab_cache != s) { pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab->name, s->name); + " is from %s\n", page->slab_cache->name, s->name); WARN_ON_ONCE(1); return; } @@ -3418,7 +3418,7 @@ size_t ksize(const void *object) return PAGE_SIZE << compound_order(page); } - return slab_ksize(page->slab); + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -3443,8 +3443,8 @@ bool verify_mem_not_deleted(const void *x) } slab_lock(page); - if (on_freelist(page->slab, page, object)) { - object_err(page->slab, page, object, "Object is on free-list"); + if (on_freelist(page->slab_cache, page, object)) { + object_err(page->slab_cache, page, object, "Object is on free-list"); rv = false; } else { rv = true; @@ -3475,7 +3475,7 @@ void kfree(const void *x) __free_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3686,11 +3686,11 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } -- cgit From b4916cb17c261a6043bcb2a98d0d6512497a7cf8 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 29 Oct 2012 22:59:58 +0900 Subject: percpu: make pcpu_free_chunk() use pcpu_mem_free() instead of kfree() commit 099a19d9('allow limited allocation before slab is online') made pcpu_alloc_chunk() use pcpu_mem_zalloc() but forgot to update pcpu_free_chunk() accordingly. This doesn't cause any immediate problema, but fix it for consistency. 
tj: commit message updated Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ddc5efb9c5bb..ec2589616e7d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) if (!chunk) return; pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); } /* -- cgit From c26251f9f06d27d1941229d237aca44a0e7b4e42 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2012 13:37:28 +0200 Subject: memcg: split mem_cgroup_force_empty into reclaiming and reparenting parts mem_cgroup_force_empty did two separate things depending on free_all parameter from the very beginning. It either reclaimed as many pages as possible and moved the rest to the parent or just moved charges to the parent. The first variant is used as memory.force_empty callback while the later is used from the mem_cgroup_pre_destroy. The whole games around gotos are far from being nice and there is no reason to keep those two functions inside one. Let's split them and also move the responsibility for css reference counting to their callers to make to code easier. This patch doesn't have any functional changes. Signed-off-by: Michal Hocko Reviewed-by: Tejun Heo Reviewed-by: Glauber Costa Signed-off-by: Tejun Heo --- mm/memcontrol.c | 72 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 795e525afaba..07d92b84f448 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3739,27 +3739,21 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, } /* - * make mem_cgroup's charge to be 0 if there is no task. + * make mem_cgroup's charge to be 0 if there is no task by moving + * all the charges and pages to the parent. * This enables deleting this mem_cgroup. + * + * Caller is responsible for holding css reference on the memcg. */ -static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) +static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { - int ret; - int node, zid, shrink; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct cgroup *cgrp = memcg->css.cgroup; + int node, zid; + int ret; - css_get(&memcg->css); - - shrink = 0; - /* should free all ? */ - if (free_all) - goto try_to_free; -move_account: do { - ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - goto out; + return -EBUSY; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); @@ -3783,27 +3777,34 @@ move_account: cond_resched(); /* "ret" should also be checked to ensure all lists are empty. */ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); -out: - css_put(&memcg->css); + return ret; +} + +/* + * Reclaims as many pages from the given memcg as possible and moves + * the rest to the parent. + * + * Caller is responsible for holding css reference for memcg. + */ +static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +{ + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct cgroup *cgrp = memcg->css.cgroup; -try_to_free: /* returns EBUSY if there is a task or if we come here twice. 
*/ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { - ret = -EBUSY; - goto out; - } + if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + return -EBUSY; + /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - shrink = 1; while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { int progress; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + if (signal_pending(current)) + return -EINTR; + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, false); if (!progress) { @@ -3814,13 +3815,19 @@ try_to_free: } lru_add_drain(); - /* try move_account...there may be some *locked* pages. */ - goto move_account; + return mem_cgroup_reparent_charges(memcg); } static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) { - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int ret; + + css_get(&memcg->css); + ret = mem_cgroup_force_empty(memcg); + css_put(&memcg->css); + + return ret; } @@ -5003,8 +5010,13 @@ free_out: static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int ret; - return mem_cgroup_force_empty(memcg, false); + css_get(&memcg->css); + ret = mem_cgroup_reparent_charges(memcg); + css_put(&memcg->css); + + return ret; } static void mem_cgroup_destroy(struct cgroup *cont) -- cgit From d842301181d9a4486aa24720ed4f96018b213292 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2012 13:37:29 +0200 Subject: memcg: root_cgroup cannot reach mem_cgroup_move_parent The root cgroup cannot be destroyed so we never hit it down the mem_cgroup_pre_destroy path and mem_cgroup_force_empty_write shouldn't even try to do anything if called for the root. This means that mem_cgroup_move_parent doesn't have to bother with the root cgroup and it can assume it can always move charges upwards. Signed-off-by: Michal Hocko Reviewed-by: Tejun Heo Reviewed-by: Glauber Costa Signed-off-by: Tejun Heo --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 07d92b84f448..916132a29b36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2715,9 +2715,7 @@ static int mem_cgroup_move_parent(struct page *page, unsigned long uninitialized_var(flags); int ret; - /* Is ROOT ? */ - if (mem_cgroup_is_root(child)) - return -EINVAL; + VM_BUG_ON(mem_cgroup_is_root(child)); ret = -EBUSY; if (!get_page_unless_zero(page)) @@ -3823,6 +3821,8 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); int ret; + if (mem_cgroup_is_root(memcg)) + return -EINVAL; css_get(&memcg->css); ret = mem_cgroup_force_empty(memcg); css_put(&memcg->css); -- cgit From 2ef37d3fe474b218e170010a59066e19427c9847 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2012 13:37:30 +0200 Subject: memcg: Simplify mem_cgroup_force_empty_list error handling mem_cgroup_force_empty_list currently tries to remove all pages from the given LRU. To prevent from temoporary failures (EBUSY returned by mem_cgroup_move_parent) it uses a margin to the current LRU pages and returns the true if there are still some pages left on the list. 
If we consider that mem_cgroup_move_parent fails only when it is racing with somebody else removing (uncharging) the page or when the page is migrated then it is obvious that all those failures are only temporal and so we can safely retry later. Let's get rid of the safety margin and make the loop really wait for the empty LRU. The caller should still make sure that all charges have been removed from the res_counter because mem_cgroup_replace_page_cache might add a page to the LRU after the list_empty check (it doesn't touch res_counter though). This catches most of the cases except for shmem which might call mem_cgroup_replace_page_cache with a page which is not charged and on the LRU yet but this was the case also without this patch. In order to fix this we need a guarantee that try_get_mem_cgroup_from_page falls back to the current mm's cgroup so it needs css_tryget to fail. This will be fixed up in a later patch because it needs a help from cgroup core (pre_destroy has to be called after css is cleared). Although mem_cgroup_pre_destroy can still fail (if a new task or a new sub-group appears) there is no reason to retry pre_destroy callback from the cgroup core. This means that __DEPRECATED_clear_css_refs has lost its meaning and it can be removed. Changes since v2 - remove __DEPRECATED_clear_css_refs Changes since v1 - use kerndoc - be more specific about mem_cgroup_move_parent possible failures Signed-off-by: Michal Hocko Reviewed-by: Tejun Heo Reviewed-by: Glauber Costa Signed-off-by: Tejun Heo --- mm/memcontrol.c | 76 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 916132a29b36..5a1d584ffed3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2702,10 +2702,27 @@ out: return ret; } -/* - * move charges to its parent. +/** + * mem_cgroup_move_parent - moves page to the parent group + * @page: the page to move + * @pc: page_cgroup of the page + * @child: page's cgroup + * + * move charges to its parent or the root cgroup if the group has no + * parent (aka use_hierarchy==0). + * Although this might fail (get_page_unless_zero, isolate_lru_page or + * mem_cgroup_move_account fails) the failure is always temporary and + * it signals a race with a page removal/uncharge or migration. In the + * first case the page is on the way out and it will vanish from the LRU + * on the next attempt and the call should be retried later. + * Isolation from the LRU fails only if page has been isolated from + * the LRU since we looked at it and that usually means either global + * reclaim or migration going on. The page will either get back to the + * LRU or vanish. + * Finaly mem_cgroup_move_account fails only if the page got uncharged + * (!PageCgroupUsed) or moved to a different group. The page will + * disappear in the next attempt. 
*/ - static int mem_cgroup_move_parent(struct page *page, struct page_cgroup *pc, struct mem_cgroup *child) @@ -2732,8 +2749,10 @@ static int mem_cgroup_move_parent(struct page *page, if (!parent) parent = root_mem_cgroup; - if (nr_pages > 1) + if (nr_pages > 1) { + VM_BUG_ON(!PageTransHuge(page)); flags = compound_lock_irqsave(page); + } ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent); @@ -3683,17 +3702,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/* +/** + * mem_cgroup_force_empty_list - clears LRU of a group + * @memcg: group to clear + * @node: NUMA node + * @zid: zone id + * @lru: lru to to clear + * * Traverse a specified page_cgroup list and try to drop them all. This doesn't - * reclaim the pages page themselves - it just removes the page_cgroups. - * Returns true if some page_cgroups were not freed, indicating that the caller - * must retry this operation. + * reclaim the pages page themselves - pages are moved to the parent (or root) + * group. */ -static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, +static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct mem_cgroup_per_zone *mz; - unsigned long flags, loop; + unsigned long flags; struct list_head *list; struct page *busy; struct zone *zone; @@ -3702,11 +3726,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, mz = mem_cgroup_zoneinfo(memcg, node, zid); list = &mz->lruvec.lists[lru]; - loop = mz->lru_size[lru]; - /* give some margin against EBUSY etc...*/ - loop += 256; busy = NULL; - while (loop--) { + do { struct page_cgroup *pc; struct page *page; @@ -3732,8 +3753,7 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, cond_resched(); } else busy = NULL; - } - return !list_empty(list); + } while (!list_empty(list)); } /* @@ -3747,7 +3767,6 @@ static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; int node, zid; - int ret; do { if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) @@ -3755,28 +3774,30 @@ static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg) /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); - ret = 0; mem_cgroup_start_move(memcg); for_each_node_state(node, N_HIGH_MEMORY) { - for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { enum lru_list lru; for_each_lru(lru) { - ret = mem_cgroup_force_empty_list(memcg, + mem_cgroup_force_empty_list(memcg, node, zid, lru); - if (ret) - break; } } - if (ret) - break; } mem_cgroup_end_move(memcg); memcg_oom_recover(memcg); cond_resched(); - /* "ret" should also be checked to ensure all lists are empty. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); - return ret; + /* + * This is a safety check because mem_cgroup_force_empty_list + * could have raced with mem_cgroup_replace_page_cache callers + * so the lru seemed empty but the page could have been added + * right after the check. RES_USAGE should be safe as we always + * charge before adding to the LRU. 
+ */ + } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + + return 0; } /* @@ -5618,7 +5639,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, - .__DEPRECATED_clear_css_refs = true, }; #ifdef CONFIG_MEMCG_SWAP -- cgit From 999d8795d438d396936811b185428d70b7b7de6d Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Fri, 19 Oct 2012 09:33:10 -0300 Subject: mm/slob: Drop usage of page->private for storing page-sized allocations This field was being used to store size allocation so it could be retrieved by ksize(). However, it is a bad practice to not mark a page as a slab page and then use fields for special purposes. There is no need to store the allocated size and ksize() can simply return PAGE_SIZE << compound_order(page). Cc: Pekka Enberg Cc: Matt Mackall Acked-by: Christoph Lameter Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slob.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index a08e4681fd0d..06a5ec70e728 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -28,9 +28,8 @@ * from kmalloc are prepended with a 4-byte header with the kmalloc size. * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact - * allocation size in page->private so that it can be used to accurately - * provide ksize(). These objects are detected in kfree() because slob_page() + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -455,11 +454,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) if (likely(order)) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - if (ret) { - struct page *page; - page = virt_to_page(ret); - page->private = size; - } trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); @@ -514,18 +508,20 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { struct page *sp; + int align; + unsigned int *m; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; sp = virt_to_page(block); - if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - unsigned int *m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; - } else - return sp->private; + if (unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); + + align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; } EXPORT_SYMBOL(ksize); -- cgit From fe74fe2bf293d061826f0d7afc2ca8456bdbb40e Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Fri, 19 Oct 2012 09:33:11 -0300 Subject: mm/slob: Use object_size field in kmem_cache_size() Fields object_size and size are not the same: the latter might include slab metadata. Return object_size field in kmem_cache_size(). Also, improve trace accuracy by correctly tracing reported size. 
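As a hypothetical illustration of that distinction (the numbers below are invented, not taken from the patch), a cache created for 64-byte objects with debugging enabled might carry fields like these:

/* relevant fields of a kmem_cache, values invented for illustration */
unsigned int object_size;	/* 64: payload the caller asked kmem_cache_create() for */
unsigned int size;		/* e.g. 96: object plus red zone / alignment metadata */

kmem_cache_size() should therefore report object_size, and the allocation tracepoint should log object_size as the requested size while the padded size remains the allocated size - which is what the slob hunks below change.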
Cc: Pekka Enberg Cc: Matt Mackall Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slob.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 06a5ec70e728..287a88aa4a61 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -554,12 +554,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } @@ -606,7 +606,7 @@ EXPORT_SYMBOL(kmem_cache_free); unsigned int kmem_cache_size(struct kmem_cache *c) { - return c->size; + return c->object_size; } EXPORT_SYMBOL(kmem_cache_size); -- cgit From 242860a47a75b933a79a30f6a40bf4858f4a3ecc Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Fri, 19 Oct 2012 09:33:12 -0300 Subject: mm/sl[aou]b: Move common kmem_cache_size() to slab.h This function is identically defined in all three allocators and it's trivial to move it to slab.h Since now it's static, inline, header-defined function this patch also drops the EXPORT_SYMBOL tag. Cc: Pekka Enberg Cc: Matt Mackall Acked-by: Christoph Lameter Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slab.c | 6 ------ mm/slob.c | 6 ------ mm/slub.c | 9 --------- 3 files changed, 21 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6d5c83c6ddd5..1f7fd5f51f87 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3969,12 +3969,6 @@ void kfree(const void *objp) } EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ - return cachep->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - /* * This initializes kmem_list3 or resizes various caches for all nodes. */ diff --git a/mm/slob.c b/mm/slob.c index 287a88aa4a61..fffbc820774d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -604,12 +604,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -unsigned int kmem_cache_size(struct kmem_cache *c) -{ - return c->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 35483e0ab6bc..deee7c754a7d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3121,15 +3121,6 @@ error: return -EINVAL; } -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { -- cgit From 8cf9864b1382851d90c7c505f8441c8928f1469e Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 22 Oct 2012 09:04:31 -0300 Subject: mm/slob: Use free_page instead of put_page for page-size kmalloc allocations When freeing objects, the slob allocator currently free empty pages calling __free_pages(). However, page-size kmallocs are disposed using put_page() instead. It makes no sense to call put_page() for kernel pages that are provided by the object allocator, so we shouldn't be doing this ourselves. 
This is based on: commit d9b7f22623b5fa9cc189581dcdfb2ac605933bf4 Author: Glauber Costa slub: use free_page instead of put_page for freeing kmalloc allocation Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Acked-by: Glauber Costa Signed-off-by: Ezequiel Garcia Signed-off-by: Pekka Enberg --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index fffbc820774d..e7d790127e4b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -500,7 +500,7 @@ void kfree(const void *block) unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(sp); + __free_pages(sp, compound_order(sp)); } EXPORT_SYMBOL(kfree); -- cgit From d8843922fba49e887874aa1f9e748d620c5092af Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 17 Oct 2012 15:36:51 +0400 Subject: slab: Ignore internal flags in cache creation Some flags are used internally by the allocators for management purposes. One example of that is the CFLGS_OFF_SLAB flag that slab uses to mark that the metadata for that cache is stored outside of the slab. No cache should ever pass those as a creation flags. We can just ignore this bit if it happens to be passed (such as when duplicating a cache in the kmem memcg patches). Because such flags can vary from allocator to allocator, we allow them to make their own decisions on that, defining SLAB_AVAILABLE_FLAGS with all flags that are valid at creation time. Allocators that doesn't have any specific flag requirement should define that to mean all flags. Common code will mask out all flags not belonging to that set. Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Glauber Costa Signed-off-by: Pekka Enberg --- mm/slab.c | 22 ---------------------- mm/slab.h | 25 +++++++++++++++++++++++++ mm/slab_common.c | 7 +++++++ mm/slub.c | 3 --- 4 files changed, 32 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1f7fd5f51f87..6ebb9515a3e9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -162,23 +162,6 @@ */ static bool pfmemalloc_active __read_mostly; -/* Legal flag mask for kmem_cache_create(). */ -#if DEBUG -# define CREATE_MASK (SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#endif - /* * kmem_bufctl_t: * @@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - /* - * Always checks flags, a caller might be expecting debug support which - * isn't available. - */ - BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. 
This is needed to avoid diff --git a/mm/slab.h b/mm/slab.h index 5a43c2f13621..66a62d3536c6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -45,6 +45,31 @@ static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t siz #endif +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + int __kmem_cache_shutdown(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 5fb753da6cf0..b705be7faa48 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,6 +109,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align if (!kmem_cache_sanity_check(name, size) == 0) goto out_locked; + /* + * Some allocators will constraint the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; s = __kmem_cache_alias(name, size, align, flags, ctor); if (s) diff --git a/mm/slub.c b/mm/slub.c index deee7c754a7d..b2ada3db4225 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -112,9 +112,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG -- cgit From 789306e5ad6b3051c263ac2478875efa8bc07462 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Oct 2012 16:55:20 +0200 Subject: mm/slob: use min_t() to compare ARCH_SLAB_MINALIGN The definition of ARCH_SLAB_MINALIGN is architecture dependent and can be either of type size_t or int. Comparing that value with ARCH_KMALLOC_MINALIGN can cause harmless warnings on platforms where they are different. Since both are always small positive integer numbers, using the size_t type to compare them is safe and gets rid of the warning. 
Without this patch, building ARM collie_defconfig results in: mm/slob.c: In function '__kmalloc_node': mm/slob.c:431:152: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'kfree': mm/slob.c:484:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] mm/slob.c: In function 'ksize': mm/slob.c:503:153: warning: comparison of distinct pointer types lacks a cast [enabled by default] Acked-by: Christoph Lameter Signed-off-by: Arnd Bergmann [ penberg@kernel.org: updates for master ] Signed-off-by: Pekka Enberg --- mm/slob.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index e7d790127e4b..87e16c4d9143 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -428,7 +428,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -496,7 +496,7 @@ void kfree(const void *block) sp = virt_to_page(block); if (PageSlab(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else @@ -519,7 +519,7 @@ size_t ksize(const void *block) if (unlikely(!PageSlab(sp))) return PAGE_SIZE << compound_order(sp); - align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -- cgit From e93160803ffda2e67d9ff9cacb63bb6868c8398f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill CSS_REMOVED CSS_REMOVED is one of the several contortions which were necessary to support css reference draining on cgroup removal. All css->refcnts which need draining should be deactivated and verified to equal zero atomically w.r.t. css_tryget(). If any one isn't zero, all refcnts needed to be re-activated and css_tryget() shouldn't fail in the process. This was achieved by letting css_tryget() busy-loop until either the refcnt is reactivated (failed removal attempt) or CSS_REMOVED is set (committing to removal). Now that css refcnt draining is no longer used, there's no need for atomic rollback mechanism. css_tryget() simply can look at the reference count and fail if it's deactivated - it's never getting re-activated. This patch removes CSS_REMOVED and updates __css_tryget() to fail if the refcnt is deactivated. As deactivation and removal are a single step now, they no longer need to be protected against css_tryget() happening from irq context. Remove local_irq_disable/enable() from cgroup_rmdir(). Note that this removes css_is_removed() whose only user is VM_BUG_ON() in memcontrol.c. We can replace it with a check on the refcnt but given that the only use case is a debug assert, I think it's better to simply unexport it. v2: Comment updated and explanation on local_irq_disable/enable() added per Michal Hocko. 
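A rough, self-contained sketch of the simplified behaviour described above (hypothetical userspace C, not the cgroup code): deactivation permanently biases the counter negative, and tryget simply fails once that has happened - no CSS_REMOVED flag, no busy-looping on a possible rollback.

#include <stdatomic.h>
#include <stdbool.h>
#include <limits.h>

#define DEACT_BIAS	INT_MIN			/* "deactivated, removal committed" */

static bool css_tryget_sketch(_Atomic int *refcnt)
{
	int v = atomic_load(refcnt);

	do {
		if (v <= 0)			/* deactivated - never re-activated */
			return false;
	} while (!atomic_compare_exchange_weak(refcnt, &v, v + 1));

	return true;
}

static void css_deactivate_sketch(_Atomic int *refcnt)
{
	atomic_fetch_add(refcnt, DEACT_BIAS);	/* one-way: no rollback path */
}

int main(void)
{
	_Atomic int ref = 1;

	css_deactivate_sketch(&ref);
	return css_tryget_sketch(&ref);		/* returns false after deactivation */
}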
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a1d584ffed3..37c356646544 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2343,7 +2343,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - VM_BUG_ON(css_is_removed(&memcg->css)); if (mem_cgroup_is_root(memcg)) goto done; if (nr_pages == 1 && consume_stock(memcg)) @@ -2483,9 +2482,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller must check css_is_removed() or some if - * it's concern. (dropping refcnt from swap can be called against removed - * memcg.) + * rcu_read_lock(). The caller is responsible for calling css_tryget if + * the mem_cgroup is used for charging. (dropping refcnt from swap can be + * called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { -- cgit From b25ed609d0eecf077db607e88ea70bae83b6adb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh --- mm/memcontrol.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 37c356646544..930edfaa5187 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2681,13 +2681,6 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, anon, nr_pages); - /* - * We charges against "to" which may not have any tasks. Then, "to" - * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and move charge, so it's - * guaranteed that "to" is never removed. So, we don't check rmdir - * status here. - */ move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -2893,7 +2886,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, return; if (!memcg) return; - cgroup_exclude_rmdir(&memcg->css); __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* @@ -2907,12 +2899,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, swp_entry_t ent = {.val = page_private(page)}; mem_cgroup_uncharge_swap(ent); } - /* - * At swapin, we may charge account against cgroup which has no tasks. 
- * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } void mem_cgroup_commit_charge_swapin(struct page *page, @@ -3360,8 +3346,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, if (!memcg) return; - /* blocks rmdir() */ - cgroup_exclude_rmdir(&memcg->css); + if (!migration_ok) { used = oldpage; unused = newpage; @@ -3395,13 +3380,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, */ if (anon) mem_cgroup_uncharge_page(used); - /* - * At migration, we may charge account against cgroup which has no - * tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } /* -- cgit From ab5196c202c60f84c7a74975742806aad242d9e3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2012 13:37:32 +0200 Subject: memcg: make mem_cgroup_reparent_charges non failing Now that pre_destroy callbacks are called from the context where neither any task can attach the group nor any children group can be added there is no other way to fail from mem_cgroup_pre_destroy. mem_cgroup_pre_destroy doesn't have to take a reference to memcg's css because all css' are marked dead already. tj: Remove now unused local variable @cgrp from mem_cgroup_reparent_charges(). Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Tejun Heo --- mm/memcontrol.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 930edfaa5187..6678f991c6c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3740,14 +3740,11 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, * * Caller is responsible for holding css reference on the memcg. */ -static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg) +static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; int node, zid; do { - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - return -EBUSY; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); @@ -3773,8 +3770,6 @@ static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg) * charge before adding to the LRU. 
*/ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); - - return 0; } /* @@ -3811,7 +3806,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) } lru_add_drain(); - return mem_cgroup_reparent_charges(memcg); + mem_cgroup_reparent_charges(memcg); + + return 0; } static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) @@ -5008,13 +5005,9 @@ free_out: static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int ret; - css_get(&memcg->css); - ret = mem_cgroup_reparent_charges(memcg); - css_put(&memcg->css); - - return ret; + mem_cgroup_reparent_charges(memcg); + return 0; } static void mem_cgroup_destroy(struct cgroup *cont) -- cgit From 9d093cb10eb482adfba6ddc71a0969b78823ee8b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2012 13:37:33 +0200 Subject: hugetlb: do not fail in hugetlb_cgroup_pre_destroy Now that pre_destroy callbacks are called from the context where neither any task can attach the group nor any children group can be added there is no other way to fail from hugetlb_pre_destroy. Signed-off-by: Michal Hocko Reviewed-by: Tejun Heo Reviewed-by: Glauber Costa Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Tejun Heo --- mm/hugetlb_cgroup.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a3f358fb8a0c..dc595c6b1f55 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -159,14 +159,9 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) { struct hstate *h; struct page *page; - int ret = 0, idx = 0; + int idx = 0; do { - if (cgroup_task_count(cgroup) || - !list_empty(&cgroup->children)) { - ret = -EBUSY; - goto out; - } for_each_hstate(h) { spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) @@ -177,8 +172,8 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) } cond_resched(); } while (hugetlb_cgroup_have_usage(cgroup)); -out: - return ret; + + return 0; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, -- cgit From bcf6de1b9129531215d26dd9af8331e84973bc52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: make ->pre_destroy() return void All ->pre_destory() implementations return 0 now, which is the only allowed return value. Make it return void. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh Cc: Vivek Goyal --- mm/hugetlb_cgroup.c | 4 +--- mm/memcontrol.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index dc595c6b1f55..0d3a1a317731 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -155,7 +155,7 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. 
*/ -static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) +static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) { struct hstate *h; struct page *page; @@ -172,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) } cond_resched(); } while (hugetlb_cgroup_have_usage(cgroup)); - - return 0; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6678f991c6c6..a1811ce60e20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5002,12 +5002,11 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup *cont) +static void mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); - return 0; } static void mem_cgroup_destroy(struct cgroup *cont) -- cgit From a755b76ab4cefe3d3aa046f3abc62b7e087336b3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 6 Nov 2012 17:10:10 -0800 Subject: mm: fix slab.c kernel-doc warnings Fix new kernel-doc warnings in mm/slab.c: Warning(mm/slab.c:2358): No description found for parameter 'cachep' Warning(mm/slab.c:2358): Excess function parameter 'name' description in '__kmem_cache_create' Warning(mm/slab.c:2358): Excess function parameter 'size' description in '__kmem_cache_create' Warning(mm/slab.c:2358): Excess function parameter 'align' description in '__kmem_cache_create' Warning(mm/slab.c:2358): Excess function parameter 'ctor' description in '__kmem_cache_create' Signed-off-by: Randy Dunlap Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Signed-off-by: Pekka Enberg --- mm/slab.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6ebb9515a3e9..e26bff5ed1a6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2314,11 +2314,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /** * __kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * @cachep: cache management descriptor * @flags: SLAB flags - * @ctor: A constructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. -- cgit From 997071bcb34005f42e0fe5bc7930e895b070f251 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 15 Nov 2012 14:34:42 -0800 Subject: mm: export a function to get vm committed memory It will be useful to be able to access global memory commitment from device drivers. On the Hyper-V platform, the host has a policy engine to balance the available physical memory amongst all competing virtual machines hosted on a given node. This policy engine is driven by a number of metrics including the memory commitment reported by the guests. The balloon driver for Linux on Hyper-V will use this function to retrieve guest memory commitment. This function is also used in Xen self ballooning code. [akpm@linux-foundation.org: coding-style tweak] Signed-off-by: K. Y. 
Srinivasan Acked-by: David Rientjes Acked-by: Dan Magenheimer Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/mmap.c | 14 ++++++++++++++ mm/nommu.c | 15 +++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2d942353d681..b064822be82e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,6 +88,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 45131b41bcdb..79c3cac87afa 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -66,6 +66,21 @@ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} + +EXPORT_SYMBOL_GPL(vm_memory_committed); + EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); -- cgit From b3834be5c42a5d2fd85ff4b819fa38983b1450e6 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Wed, 19 Sep 2012 21:48:02 -0400 Subject: various: Fix spelling of "asynchronous" in comments. "Asynchronous" is misspelled in some comments. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 56b758ae57d2..bb81b7f417a8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -950,10 +950,10 @@ repeat: goto repeat; } } - /* drain all zone's lru pagevec, this is asyncronous... */ + /* drain all zone's lru pagevec, this is asynchronous... */ lru_add_drain_all(); yield(); - /* drain pcp pages , this is synchrouns. */ + /* drain pcp pages, this is synchronous. */ drain_all_pages(); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); @@ -962,7 +962,7 @@ repeat: goto failed_removal; } printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); - /* Ok, all of our target is islaoted. + /* Ok, all of our target is isolated. We cannot do rollback at this point. 
*/ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ -- cgit From 92fb97487a7e41b222c1417cabd1d1ab7cc3a48c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free() Rename cgroup_subsys css lifetime related callbacks to better describe what their roles are. Also, update documentation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- mm/hugetlb_cgroup.c | 14 +++++++------- mm/memcontrol.c | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 0d3a1a317731..b5bde7a5c017 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) return false; } -static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) +static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) { int idx; struct cgroup *parent_cgroup; @@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) return &h_cgroup->css; } -static void hugetlb_cgroup_destroy(struct cgroup *cgroup) +static void hugetlb_cgroup_css_free(struct cgroup *cgroup) { struct hugetlb_cgroup *h_cgroup; @@ -155,7 +155,7 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) +static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) { struct hstate *h; struct page *page; @@ -404,8 +404,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) struct cgroup_subsys hugetlb_subsys = { .name = "hugetlb", - .create = hugetlb_cgroup_create, - .pre_destroy = hugetlb_cgroup_pre_destroy, - .destroy = hugetlb_cgroup_destroy, - .subsys_id = hugetlb_subsys_id, + .css_alloc = hugetlb_cgroup_css_alloc, + .css_offline = hugetlb_cgroup_css_offline, + .css_free = hugetlb_cgroup_css_free, + .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08adaaae6fcc..8b0b2b028e23 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4922,7 +4922,7 @@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -5003,14 +5003,14 @@ free_out: return ERR_PTR(error); } -static void mem_cgroup_pre_destroy(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); } -static void mem_cgroup_destroy(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); @@ -5600,9 +5600,9 @@ static void mem_cgroup_move_task(struct cgroup *cont, struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, - .create = mem_cgroup_create, - .pre_destroy = mem_cgroup_pre_destroy, - .destroy = mem_cgroup_destroy, + .css_alloc = mem_cgroup_css_alloc, + .css_offline = mem_cgroup_css_offline, + .css_free = mem_cgroup_css_free, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, -- cgit From ef6c5be658f6a70c1256fbd18e18ee0dc24c3386 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 21 Nov 2012 
14:21:51 -0500 Subject: fix incorrect NR_FREE_PAGES accounting (appears like memory leak) There have been some 3.7-rc reports of vm issues, including some kswapd bugs and, more importantly, some memory "leaks": http://www.spinics.net/lists/linux-mm/msg46187.html https://bugzilla.kernel.org/show_bug.cgi?id=50181 Commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available") took split_free_page() and reused it for the compaction code. It does something curious with capture_free_page() (previously known as split_free_page()): int capture_free_page(struct page *page, int alloc_order, ... __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); Note that expand() puts the pages _back_ in the allocator, but it does not bump NR_FREE_PAGES. We "return" 'alloc_order' worth of pages, but we accounted for removing 'order' in the __mod_zone_page_state() call. For the old split_page()-style use (order==alloc_order) the bug will not trigger. But, when called from the compaction code where we occasionally get a larger page out of the buddy allocator than we need, we will run in to this. This patch simply changes the NR_FREE_PAGES manipulation to the correct 'alloc_order' instead of 'order'. I've been able to repeatedly trigger this in my testing environment. The amount "leaked" very closely tracks the imbalance I see in buddy pages vs. NR_FREE_PAGES. I have confirmed that this patch fixes the imbalance Signed-off-by: Dave Hansen Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7bb35ac0964a..bcb72c6e2b2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1405,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) mt = get_pageblock_migratetype(page); if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); + __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); if (alloc_order != order) expand(zone, page, alloc_order, order, -- cgit From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:45 -0800 Subject: Revert "mm: remove __GFP_NO_KSWAPD" With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. 
If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there are a storm of THP requests that are simply rejected, it will still be the the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loopp, shrinking a small number of pages and calling shrink_slab() on each iteration. The temptation is to supply a patch that checks if kswapd was woken for THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not backed up by proper testing. As 3.7 is very close to release and this is not a bug we should release with, a safer path is to revert "mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing out the balance_pgdat() logic in general. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcb72c6e2b2d..92871579cbee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2494,7 +2495,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ -- cgit From 50694c28f1e1dbea18272980d265742a5027fb63 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:48 -0800 Subject: mm: vmscan: check for fatal signals iff the process was throttled Commit 5515061d22f0 ("mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage") introduced a check for fatal signals after a process gets throttled for network storage. The intention was that if a process was throttled and got killed that it should not trigger the OOM killer. As pointed out by Minchan Kim and David Rientjes, this check is in the wrong place and too broad. If a system is in am OOM situation and a process is exiting, it can loop in __alloc_pages_slowpath() and calling direct reclaim in a loop. As the fatal signal is pending it returns 1 as if it is making forward progress and can effectively deadlock. This patch moves the fatal_signal_pending() check after throttling to throttle_direct_reclaim() where it belongs. If the process is killed while throttled, it will return immediately without direct reclaim except now it will have TIF_MEMDIE set and will use the PFMEMALLOC reserves. Minchan pointed out that it may be better to direct reclaim before returning to avoid using the reserves because there may be pages that can easily reclaim that would avoid using the reserves. However, we do no such targetted reclaim and there is no guarantee that suitable pages are available. 
As it is expected that this throttling happens when swap-over-NFS is used there is a possibility that the process will instead swap which may allocate network buffers from the PFMEMALLOC reserves. Hence, in the swap-over-nfs case where a process can be throtted and be killed it can use the reserves to exit or it can potentially use reserves to swap a few pages and then exit. This patch takes the option of using the reserves if necessary to allow the process exit quickly. If this patch passes review it should be considered a -stable candidate for 3.6. Signed-off-by: Mel Gorman Cc: David Rientjes Cc: Luigi Semenzato Cc: Dan Magenheimer Cc: KOSAKI Motohiro Cc: Sonny Rao Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 48550c66f1f2..cbf84e152f04 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2207,9 +2207,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes - * when the low watermark is reached + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. */ -static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zone *zone; @@ -2224,13 +2227,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) - return; + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; /* Check if the pfmemalloc reserves are ok */ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); pgdat = zone->zone_pgdat; if (pfmemalloc_watermark_ok(pgdat)) - return; + goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); @@ -2246,12 +2256,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat), HZ); - return; + + goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, @@ -2273,13 +2291,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = sc.gfp_mask, }; - throttle_direct_reclaim(gfp_mask, zonelist, nodemask); - /* - * Do not enter reclaim if fatal signal is pending. 1 is returned so - * that the page allocator does not consider triggering OOM + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. 
*/ - if (fatal_signal_pending(current)) + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, -- cgit From 58d002097b98664e2a39cc708f30d11549d870b2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Nov 2012 13:54:20 -0800 Subject: mm: compaction: fix return value of capture_free_page() Commit ef6c5be658f6 ("fix incorrect NR_FREE_PAGES accounting (appears like memory leak)") fixes a NR_FREE_PAGE accounting leak but missed the return value which was also missed by this reviewer until today. That return value is used by compaction when adding pages to a list of isolated free pages and without this follow-up fix, there is a risk of free list corruption. Signed-off-by: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 92871579cbee..7e208f0ad68c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1422,7 +1422,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << order; + return 1UL << alloc_order; } /* -- cgit From ae64ffcac35de0db628ba9631edf8ff34c5cd7ac Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 29 Nov 2012 13:54:21 -0800 Subject: mm/vmemmap: fix wrong use of virt_to_page I enable CONFIG_DEBUG_VIRTUAL and CONFIG_SPARSEMEM_VMEMMAP, when doing memory hotremove, there is a kernel BUG at arch/x86/mm/physaddr.c:20. It is caused by free_section_usemap()->virt_to_page(), virt_to_page() is only used for kernel direct mapping address, but sparse-vmemmap uses vmemmap address, so it is going wrong here. ------------[ cut here ]------------ kernel BUG at arch/x86/mm/physaddr.c:20! invalid opcode: 0000 [#1] SMP Modules linked in: acpihp_drv acpihp_slot edd cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf fuse vfat fat loop dm_mod coretemp kvm crc32c_intel ipv6 ixgbe igb iTCO_wdt i7core_edac edac_core pcspkr iTCO_vendor_support ioatdma microcode joydev sr_mod i2c_i801 dca lpc_ich mfd_core mdio tpm_tis i2c_core hid_generic tpm cdrom sg tpm_bios rtc_cmos button ext3 jbd mbcache usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif processor thermal_sys hwmon scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh_emc scsi_dh ata_generic ata_piix libata megaraid_sas scsi_mod CPU 39 Pid: 6454, comm: sh Not tainted 3.7.0-rc1-acpihp-final+ #45 QCI QSSC-S4R/QSSC-S4R RIP: 0010:[] [] __phys_addr+0x88/0x90 RSP: 0018:ffff8804440d7c08 EFLAGS: 00010006 RAX: 0000000000000006 RBX: ffffea0012000000 RCX: 000000000000002c ... 
Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reviewd-by: Wen Congyang Acked-by: Johannes Weiner Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index fac95f2888f2..a83de2f72b30 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { return; /* XXX: Not implemented yet */ } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { } #else @@ -658,10 +658,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) get_order(sizeof(struct page) * nr_pages)); } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; unsigned long magic; + struct page *page = virt_to_page(memmap); for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -710,13 +711,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) */ if (memmap) { - struct page *memmap_page; - memmap_page = virt_to_page(memmap); - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT; - free_map_bootmem(memmap_page, nr_pages); + free_map_bootmem(memmap, nr_pages); } } -- cgit From 60cefed485a02bd99b6299dad70666fe49245da7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Nov 2012 13:54:23 -0800 Subject: mm: vmscan: fix endless loop in kswapd balancing Kswapd does not in all places have the same criteria for a balanced zone. Zones are only being reclaimed when their high watermark is breached, but compaction checks loop over the zonelist again when the zone does not meet the low watermark plus two times the size of the allocation. This gets kswapd stuck in an endless loop over a small zone, like the DMA zone, where the high watermark is smaller than the compaction requirement. Add a function, zone_balanced(), that checks the watermark, and, for higher order allocations, if compaction has enough free memory. Then use it uniformly to check for balanced zones. This makes sure that when the compaction watermark is not met, at least reclaim happens and progress is made - or the zone is declared unreclaimable at some point and skipped entirely. 
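A rough back-of-the-envelope illustration of the mismatch described above (the watermark numbers are assumptions, not values taken from the patch; real watermarks depend on min_free_kbytes and zone size):

    #include <stdio.h>

    int main(void)
    {
            unsigned long zone_pages = (16UL << 20) / 4096;  /* ~4096 pages in a 16MB DMA zone */
            unsigned long high_wmark = 32;                   /* assumed: what reclaim aims for */
            unsigned long low_wmark  = 24;                   /* assumed */
            unsigned int  order      = 9;                    /* THP-sized request */

            /* "low watermark plus two times the size of the allocation" */
            unsigned long compaction_need = low_wmark + (2UL << order);

            printf("zone size        : %lu pages\n", zone_pages);
            printf("reclaim target   : %lu free pages\n", high_wmark);
            printf("compaction needs : %lu free pages\n", compaction_need);
            return 0;
    }

With these illustrative numbers the zone is "balanced" for reclaim at 32 free pages while compaction keeps demanding over a thousand, so kswapd stops making progress yet never satisfies the compaction check - exactly the loop that the uniform zone_balanced() check breaks.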
Signed-off-by: Johannes Weiner Reported-by: George Spelvin Reported-by: Johannes Hirte Reported-by: Tomas Racek Tested-by: Johannes Hirte Reviewed-by: Rik van Riel Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cbf84e152f04..83f4d0e85601 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2414,6 +2414,19 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } +static bool zone_balanced(struct zone *zone, int order, + unsigned long balance_gap, int classzone_idx) +{ + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + + balance_gap, classzone_idx, 0)) + return false; + + if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) + return false; + + return true; +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. Only zones that meet watermarks and are in a zone allowed @@ -2492,8 +2505,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, continue; } - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - i, 0)) + if (!zone_balanced(zone, order, 0, i)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2602,8 +2614,7 @@ loop_again: break; } - if (!zone_watermark_ok_safe(zone, order, - high_wmark_pages(zone), 0, 0)) { + if (!zone_balanced(zone, order, 0, 0)) { end_zone = i; break; } else { @@ -2679,9 +2690,8 @@ loop_again: testorder = 0; if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_watermark_ok_safe(zone, testorder, - high_wmark_pages(zone) + balance_gap, - end_zone, 0)) { + !zone_balanced(zone, testorder, + balance_gap, end_zone)) { shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2708,8 +2718,7 @@ loop_again: continue; } - if (!zone_watermark_ok_safe(zone, testorder, - high_wmark_pages(zone), end_zone, 0)) { + if (!zone_balanced(zone, testorder, 0, end_zone)) { all_zones_ok = 0; /* * We are still under min water mark. This -- cgit From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 Nov 2012 13:54:27 -0800 Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD"" It apepars that this patch was innocent, and we hope that "mm: avoid waking kswapd for THP allocations when compaction is deferred or contended" will fix the final kswapd-spinning cause. Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..8193809f3de0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2495,7 +2494,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. 
*/ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ -- cgit From 782fd30406ecb9d9b082816abe0c6008fc72a7b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Nov 2012 13:54:30 -0800 Subject: mm: avoid waking kswapd for THP allocations when compaction is deferred or contended With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there are a storm of THP requests that are simply rejected, it will still be the the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loopp, shrinking a small number of pages and calling shrink_slab() on each iteration. This patch defers when kswapd gets woken up for THP allocations. For !THP allocations, kswapd is always woken up. For THP allocations, kswapd is woken up iff the process is willing to enter into direct reclaim/compaction. 
[akpm@linux-foundation.org: fix typo in comment] Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Valdis Kletnieks Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8193809f3de0..a8f2c87792c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2378,6 +2378,15 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +/* Returns true if the allocation is likely for THP */ +static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order) +{ + if (order == pageblock_order && + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + return true; + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2416,7 +2425,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + /* The decision whether to wake kswapd for THP is made later */ + if (!is_thp_alloc(gfp_mask, order)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* @@ -2487,15 +2498,21 @@ rebalance: goto got_pg; sync_migration = true; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. - */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) - goto nopage; + if (is_thp_alloc(gfp_mask, order)) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a movable allocation that does not + * heavily disrupt the system then fail the allocation instead + * of entering direct reclaim. + */ + if (deferred_compaction || contended_compaction) + goto nopage; + + /* If process is willing to reclaim/compact then wake kswapd */ + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); + } /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, -- cgit From 783657a7dc20e5c0efbc9a09a9dd38e238a723da Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 29 Nov 2012 13:54:34 -0800 Subject: mm: soft offline: split thp at the beginning of soft_offline_page() When we try to soft-offline a thp tail page, put_page() is called on the tail page unthinkingly and VM_BUG_ON is triggered in put_compound_page(). This patch splits thp before going into the main body of soft-offlining. 
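A hypothetical user-space reproducer sketch for the scenario above. Everything here is an assumption beyond what the commit states: it needs CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN, it assumes the region really gets backed by a THP, and a careful reproducer would align the mapping to 2MB so the offset page is definitely a tail page.

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <string.h>

    int main(void)
    {
            size_t len = 4UL << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            madvise(p, len, MADV_HUGEPAGE);        /* ask for THP backing */
            memset(p, 0, len);                     /* fault the range in */

            /* soft-offline one page inside the (hopefully) huge mapping;
             * before this patch, hitting a tail page this way could trip
             * the VM_BUG_ON described above */
            madvise(p + 4096, 4096, MADV_SOFT_OFFLINE);
            return 0;
    }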
Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: Andi Kleen Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6c5899b9034a..8b20278be6a6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); if (PageHuge(page)) return soft_offline_huge_page(page, flags); + if (PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } ret = get_any_page(page, pfn, flags); if (ret < 0) -- cgit From 5479c78ac6f688ea5ea8c49b44cf90ea87b63931 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 25 Nov 2012 01:17:13 +0400 Subject: mm, percpu: Make sure percpu_alloc early parameter has an argument Otherwise we are getting a nil dereference if percpu_alloc kernel boot argument is specified without value. | [ 0.000000] BUG: unable to handle kernel NULL pointer dereference at (null) | [ 0.000000] IP: [] strcmp+0x10/0x30 Signed-off-by: Cyrill Gorcunov Signed-off-by: Tejun Heo --- mm/percpu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ec2589616e7d..8c8e08f3a692 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { + if (!str) + return -EINVAL; + if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -- cgit From 8fa72d234da9b6b473bbb1f74d533663e4996e6b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 5 Dec 2012 20:17:21 +0100 Subject: bdi: add a user-tunable cpu_list for the bdi flusher threads In realtime environments, it may be desirable to keep the per-bdi flusher threads from running on certain cpus. This patch adds a cpu_list file to /sys/class/bdi/* to enable this. The default is to tie the flusher threads to the same numa node as the backing device (though I could be convinced to make it a mask of all cpus to avoid a change in behaviour). Thanks to Jeremy Eder for the original idea. 
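A hypothetical usage sketch for the new sysfs knob. The "8:0" bdi name is an assumption (block-device bdis are conventionally named major:minor); adjust it to whatever appears under /sys/class/bdi on the target system, and note that error handling is minimal.

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            /* pin this device's flusher thread to CPUs 0-3; the string uses
             * the same syntax cpulist_parse() accepts */
            int fd = open("/sys/class/bdi/8:0/cpu_list", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "0-3", 3);
            close(fd);
            return 0;
    }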
Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- mm/backing-dev.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d3ca2b3ee176..bd6a6cabef71 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include #include #include +#include #include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -221,12 +222,63 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) +static ssize_t cpu_list_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + struct bdi_writeback *wb = &bdi->wb; + cpumask_var_t newmask; + ssize_t ret; + struct task_struct *task; + + if (!alloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buf, newmask); + if (!ret) { + spin_lock_bh(&bdi->wb_lock); + task = wb->task; + if (task) + get_task_struct(task); + spin_unlock_bh(&bdi->wb_lock); + + mutex_lock(&bdi->flusher_cpumask_lock); + if (task) { + ret = set_cpus_allowed_ptr(task, newmask); + put_task_struct(task); + } + if (ret == 0) { + cpumask_copy(bdi->flusher_cpumask, newmask); + ret = count; + } + mutex_unlock(&bdi->flusher_cpumask_lock); + + } + free_cpumask_var(newmask); + + return ret; +} + +static ssize_t cpu_list_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + ssize_t ret; + + mutex_lock(&bdi->flusher_cpumask_lock); + ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask); + mutex_unlock(&bdi->flusher_cpumask_lock); + + return ret; +} + #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), + __ATTR_RW(cpu_list), __ATTR_NULL, }; @@ -428,6 +480,7 @@ static int bdi_forker_thread(void *ptr) writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD); } else { + int ret; /* * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. @@ -437,6 +490,14 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; spin_unlock_bh(&bdi->wb_lock); + mutex_lock(&bdi->flusher_cpumask_lock); + ret = set_cpus_allowed_ptr(task, + bdi->flusher_cpumask); + mutex_unlock(&bdi->flusher_cpumask_lock); + if (ret) + printk_once("%s: failed to bind flusher" + " thread %s, error %d\n", + __func__, task->comm, ret); wake_up_process(task); } bdi_clear_pending(bdi); @@ -509,6 +570,17 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, dev_name(dev)); if (IS_ERR(wb->task)) return PTR_ERR(wb->task); + } else { + int node; + /* + * Set up a default cpumask for the flusher threads that + * includes all cpus on the same numa node as the device. + * The mask may be overridden via sysfs. 
+ */ + node = dev_to_node(bdi->dev); + if (node != NUMA_NO_NODE) + cpumask_copy(bdi->flusher_cpumask, + cpumask_of_node(node)); } bdi_debug_register(bdi, dev_name(dev)); @@ -634,6 +706,15 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); + if (!bdi_cap_flush_forker(bdi)) { + bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!bdi->flusher_cpumask) + return -ENOMEM; + cpumask_setall(bdi->flusher_cpumask); + mutex_init(&bdi->flusher_cpumask_lock); + } else + bdi->flusher_cpumask = NULL; + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) @@ -656,6 +737,7 @@ int bdi_init(struct backing_dev_info *bdi) err: while (i--) percpu_counter_destroy(&bdi->bdi_stat[i]); + kfree(bdi->flusher_cpumask); } return err; @@ -683,6 +765,8 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); + kfree(bdi->flusher_cpumask); + /* * If bdi_unregister() had already been called earlier, the * wakeup_timer could still be armed because bdi_prune_sb() -- cgit From 6d49e352ae9aed3f599041b0c0389aa924815f14 Mon Sep 17 00:00:00 2001 From: Nadia Yvette Chambers Date: Thu, 6 Dec 2012 10:39:54 +0100 Subject: propagate name change to comments in kernel source I've legally changed my name with New York State, the US Social Security Administration, et al. This patch propagates the name change and change in initials and login to comments in the kernel source as well. Signed-off-by: Nadia Yvette Chambers Signed-off-by: Jiri Kosina --- mm/hugetlb.c | 2 +- mm/page_alloc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..3b7a20ea3808 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1,6 +1,6 @@ /* * Generic hugetlb support. - * (C) William Irwin, April 2004 + * (C) Nadia Yvette Chambers, April 2004 */ #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b74de6702e0..57806b7e118b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -523,7 +523,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * If a block is freed, and its buddy is also free, then this * triggers coalescing into a block of larger size. * - * -- wli + * -- nyc */ static inline void __free_one_page(struct page *page, @@ -780,7 +780,7 @@ void __init init_cma_reserved_pageblock(struct page *page) * large block of memory acted on by a series of small allocations. * This behavior is a critical factor in sglist merging's success. * - * -- wli + * -- nyc */ static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area, -- cgit From 60177d31d215bc2b4c5a7aa6f742800e04fa0a92 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 6 Dec 2012 19:01:14 +0000 Subject: mm: compaction: validate pfn range passed to isolate_freepages_block Commit 0bf380bc70ec ("mm: compaction: check pfn_valid when entering a new MAX_ORDER_NR_PAGES block during isolation for migration") added a check for pfn_valid() when isolating pages for migration as the scanner does not necessarily start pageblock-aligned. Since commit c89511ab2f8f ("mm: compaction: Restart compaction from near where it left off"), the free scanner has the same problem. This patch makes sure that the pfn range passed to isolate_freepages_block() is within the same block so that pfn_valid() checks are unnecessary. 
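A tiny worked example of what the ALIGN() added by this patch buys (see the hunk below). The values assume pageblock_nr_pages == 512 and MAX_ORDER_NR_PAGES == 1024, which is typical for x86 but configuration dependent.

    #include <stdio.h>

    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            unsigned long pageblock = 512;
            unsigned long pfn = 1000;     /* free scanner not pageblock-aligned */

            unsigned long old_end = pfn + pageblock;              /* 1512: crosses the
                                                                     MAX_ORDER boundary at
                                                                     1024, possibly into a
                                                                     hole with no memmap */
            unsigned long new_end = ALIGN_UP(pfn + 1, pageblock); /* 1024: stays within the
                                                                     current pageblock */

            printf("old end_pfn=%lu new end_pfn=%lu\n", old_end, new_end);
            return 0;
    }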
In answer to Henrik's wondering why others have not reported this: reproducing this requires a large enough hole with the right alignment to have compaction walk into a PFN range with no memmap. Size and alignment depend on the memory model - 4M for FLATMEM and 128M for SPARSEMEM on x86. It needs a "lucky" machine. Reported-by: Henrik Rydberg Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 9eef55838fca..694eaabaaebd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -713,7 +713,15 @@ static void isolate_freepages(struct zone *zone, /* Found a block suitable for isolating free pages from */ isolated = 0; - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + + /* + * As pfn may not start aligned, pfn+pageblock_nr_page + * may cross a MAX_ORDER_NR_PAGES boundary and miss + * a pfn_valid check. Ensure isolate_freepages_block() + * only scans within a pageblock + */ + end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + end_pfn = min(end_pfn, zone_end_pfn); isolated = isolate_freepages_block(cc, pfn, end_pfn, freelist, false); nr_freepages += isolated; -- cgit From c702418f8a2fa6cc92e84a39880d458faf7af9cc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 4 Dec 2012 11:11:31 -0500 Subject: mm: vmscan: do not keep kswapd looping forever due to individual uncompactable zones When a zone meets its high watermark and is compactable in case of higher order allocations, it contributes to the percentage of the node's memory that is considered balanced. This requirement, that a node be only partially balanced, came about when kswapd was desperately trying to balance tiny zones when all bigger zones in the node had plenty of free memory. Arguably, the same should apply to compaction: if a significant part of the node is balanced enough to run compaction, do not get hung up on that tiny zone that might never get in shape. When the compaction logic in kswapd is reached, we know that at least 25% of the node's memory is balanced properly for compaction (see zone_balanced and pgdat_balanced). Remove the individual zone checks that restart the kswapd cycle. Otherwise, we may observe more endless looping in kswapd where the compaction code loops back to reclaim because of a single zone and reclaim does nothing because the node is considered balanced overall. See for example https://bugzilla.redhat.com/show_bug.cgi?id=866988
*/ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), *classzone_idx, 0)) -- cgit From 18a2f371f5edf41810f6469cb9be39931ef9deb9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Dec 2012 14:01:41 -0800 Subject: tmpfs: fix shared mempolicy leak This fixes a regression in 3.7-rc, which has since gone into stable. Commit 00442ad04a5e ("mempolicy: fix a memory corruption by refcount imbalance in alloc_pages_vma()") changed get_vma_policy() to raise the refcount on a shmem shared mempolicy; whereas shmem_alloc_page() went on expecting alloc_page_vma() to drop the refcount it had acquired. This deserves a rework: but for now fix the leak in shmem_alloc_page(). Hugh: shmem_swapin() did not need a fix, but surely it's clearer to use the same refcounting there as in shmem_alloc_page(), delete its onstack mempolicy, and the strange mpol_cond_copy() and __mpol_cond_copy() - those were invented to let swapin_readahead() make an unknown number of calls to alloc_pages_vma() with one mempolicy; but since 00442ad04a5e, alloc_pages_vma() has kept refcount in balance, so now no problem. Reported-and-tested-by: Tommi Rantala Signed-off-by: Mel Gorman Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 22 ---------------------- mm/shmem.c | 26 ++++++++++++++++---------- 2 files changed, 16 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d04a8a54c294..4ea600da8940 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2037,28 +2037,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) return new; } -/* - * If *frompol needs [has] an extra ref, copy *frompol to *tompol , - * eliminate the * MPOL_F_* flags that require conditional ref and - * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly - * after return. Use the returned value. - * - * Allows use of a mempolicy for, e.g., multiple allocations with a single - * policy lookup, even if the policy needs/has extra ref on lookup. - * shmem_readahead needs this. 
- */ -struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, - struct mempolicy *frompol) -{ - if (!mpol_needs_cond_ref(frompol)) - return frompol; - - *tompol = *frompol; - tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ - __mpol_put(frompol); - return tompol; -} - /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { diff --git a/mm/shmem.c b/mm/shmem.c index 89341b658bd0..50c5b8f3a359 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct mempolicy mpol, *spol; struct vm_area_struct pvma; - - spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, index)); + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; /* Bias interleave by inode number to distribute better across nodes */ pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; - pvma.vm_policy = spol; - return swapin_readahead(swap, gfp, &pvma, 0); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + + page = swapin_readahead(swap, gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; @@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - /* - * alloc_page_vma() will drop the shared policy reference - */ - return alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma(gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -- cgit From ed23ec4f0a510528e0ffe415f9394107418ae854 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Dec 2012 15:23:25 -0500 Subject: mm: vmscan: fix inappropriate zone congestion clearing commit c702418f8a2f ("mm: vmscan: do not keep kswapd looping forever due to individual uncompactable zones") removed zone watermark checks from the compaction code in kswapd but left in the zone congestion clearing, which now happens unconditionally on higher order reclaim. This messes up the reclaim throttling logic for zones with dirty/writeback pages, where zones should only lose their congestion status when their watermarks have been restored. Remove the clearing from the zone compaction section entirely. The preliminary zone check and the reclaim loop in kswapd will clear it if the zone is considered balanced. 
Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 124bbfe5cc52..b7ed37675644 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2827,9 +2827,6 @@ out: if (zone_watermark_ok(zone, order, low_wmark_pages(zone), *classzone_idx, 0)) zones_need_compaction = 0; - - /* If balanced, clear the congested flag */ - zone_clear_flag(zone, ZONE_CONGESTED); } if (zones_need_compaction) -- cgit From 31f8d42d44b48ba72b586ca03e810cbbd21ea16b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Dec 2012 10:47:45 -0800 Subject: Revert "mm: avoid waking kswapd for THP allocations when compaction is deferred or contended" This reverts commit 782fd30406ecb9d9b082816abe0c6008fc72a7b0. We are going to reinstate the __GFP_NO_KSWAPD flag that has been removed, the removal reverted, and then removed again. Making this commit a pointless fixup for a problem that was caused by the removal of __GFP_NO_KSWAPD flag. The thing is, we really don't want to wake up kswapd for THP allocations (because they fail quite commonly under any kind of memory pressure, including when there is tons of memory free), and these patches were just trying to fix up the underlying bug: the original removal of __GFP_NO_KSWAPD in commit c654345924f7 ("mm: remove __GFP_NO_KSWAPD") was simply bogus. Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8f2c87792c3..8193809f3de0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2378,15 +2378,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } -/* Returns true if the allocation is likely for THP */ -static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order) -{ - if (order == pageblock_order && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) - return true; - return false; -} - static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2425,9 +2416,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - /* The decision whether to wake kswapd for THP is made later */ - if (!is_thp_alloc(gfp_mask, order)) - wake_all_kswapd(order, zonelist, high_zoneidx, + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* @@ -2498,21 +2487,15 @@ rebalance: goto got_pg; sync_migration = true; - if (is_thp_alloc(gfp_mask, order)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a movable allocation that does not - * heavily disrupt the system then fail the allocation instead - * of entering direct reclaim. - */ - if (deferred_compaction || contended_compaction) - goto nopage; - - /* If process is willing to reclaim/compact then wake kswapd */ - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); - } + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. 
+ */ + if ((deferred_compaction || contended_compaction) && + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + goto nopage; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, -- cgit From caf491916b1c1e939a2c7575efb7a77f11fc9bdf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Dec 2012 10:51:16 -0800 Subject: Revert "revert "Revert "mm: remove __GFP_NO_KSWAPD""" and associated damage This reverts commits a50915394f1fc02c2861d3b7ce7014788aa5066e and d7c3b937bdf45f0b844400b7bf6fd3ed50bac604. This is a revert of a revert of a revert. In addition, it reverts the even older i915 change to stop using the __GFP_NO_KSWAPD flag due to the original commits in linux-next. It turns out that the original patch really was bogus, and that the original revert was the correct thing to do after all. We thought we had fixed the problem, and then reverted the revert, but the problem really is fundamental: waking up kswapd simply isn't the right thing to do, and direct reclaim sometimes simply _is_ the right thing to do. When certain allocations fail, we simply should try some direct reclaim, and if that fails, fail the allocation. That's the right thing to do for THP allocations, which can easily fail, and the GPU allocations want to do that too. So starting kswapd is sometimes simply wrong, and removing the flag that said "don't start kswapd" was a mistake. Let's hope we never revisit this mistake again - and certainly not this many times ;) Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8193809f3de0..7e208f0ad68c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2494,7 +2495,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ -- cgit From 387870f2d6d679746020fa8e25ef786ff338dc98 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 7 Nov 2012 15:37:07 +0100 Subject: mm: dmapool: use provided gfp flags for all dma_alloc_coherent() calls dmapool always calls dma_alloc_coherent() with the GFP_ATOMIC flag, regardless of the flags provided by the caller. This causes excessive pruning of emergency memory pools without any good reason. Additionally, on the ARM architecture any driver which is using dmapools will sooner or later trigger the following error: "ERROR: 256 KiB atomic DMA coherent pool is too small! Please increase it with coherent_pool= kernel parameter!". Increasing the coherent pool size usually doesn't help much and only delays such an error, because all GFP_ATOMIC DMA allocations are always served from the special, very limited memory pool. This patch changes the dmapool code to correctly use the gfp flags provided by the dmapool caller.
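As an illustration of what this buys a caller (the device, pool name and sizes below are made up), a driver running in process context can now pass GFP_KERNEL and have it reach dma_alloc_coherent() instead of being silently replaced by GFP_ATOMIC:

	#include <linux/dmapool.h>

	/* Hypothetical probe-path helper; 'dev' is the driver's struct device. */
	static int example_setup_pool(struct device *dev)
	{
		struct dma_pool *pool;
		dma_addr_t handle;
		void *buf;

		/* example parameters only: 512-byte blocks, 64-byte alignment */
		pool = dma_pool_create("example-pool", dev, 512, 64, 0);
		if (!pool)
			return -ENOMEM;

		/* may sleep now; previously this was always served as GFP_ATOMIC */
		buf = dma_pool_alloc(pool, GFP_KERNEL, &handle);
		if (!buf) {
			dma_pool_destroy(pool);
			return -ENOMEM;
		}

		/* ... use buf/handle, later dma_pool_free(pool, buf, handle) ... */
		return 0;
	}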
Reported-by: Soeren Moch Reported-by: Thomas Petazzoni Signed-off-by: Marek Szyprowski Tested-by: Andrew Lunn Tested-by: Soeren Moch Cc: stable@vger.kernel.org --- mm/dmapool.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index c5ab33bca0a8..da1b0f0b8709 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -50,7 +50,6 @@ struct dma_pool { /* the pool */ size_t allocation; size_t boundary; char name[32]; - wait_queue_head_t waitq; struct list_head pools; }; @@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ unsigned int offset; }; -#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) - static DEFINE_MUTEX(pools_lock); static ssize_t @@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; - init_waitqueue_head(&retval->waitq); if (dev) { int ret; @@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); - list_add(&page->page_list, &pool->page_list); page->in_use = 0; page->offset = 0; } else { @@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, might_sleep_if(mem_flags & __GFP_WAIT); spin_lock_irqsave(&pool->lock, flags); - restart: list_for_each_entry(page, &pool->page_list, page_list) { if (page->offset < pool->allocation) goto ready; } - page = pool_alloc_page(pool, GFP_ATOMIC); - if (!page) { - if (mem_flags & __GFP_WAIT) { - DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_UNINTERRUPTIBLE); - __add_wait_queue(&pool->waitq, &wait); - spin_unlock_irqrestore(&pool->lock, flags); + /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ + spin_unlock_irqrestore(&pool->lock, flags); - schedule_timeout(POOL_TIMEOUT_JIFFIES); + page = pool_alloc_page(pool, mem_flags); + if (!page) + return NULL; - spin_lock_irqsave(&pool->lock, flags); - __remove_wait_queue(&pool->waitq, &wait); - goto restart; - } - retval = NULL; - goto done; - } + spin_lock_irqsave(&pool->lock, flags); + list_add(&page->page_list, &pool->page_list); ready: page->in_use++; offset = page->offset; @@ -348,7 +334,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, #ifdef DMAPOOL_DEBUG memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif - done: spin_unlock_irqrestore(&pool->lock, flags); return retval; } @@ -435,8 +420,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; - if (waitqueue_active(&pool->waitq)) - wake_up_locked(&pool->waitq); /* * Resist a temptation to do * if (!is_page_busy(page)) pool_free_page(pool, page); -- cgit From 59a09917c95e5209135b4f1a87f1263d6ef40fdb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:00 +0000 Subject: slub: Use correct cpu_slab on dead cpu Pass a kmem_cache_cpu pointer into unfreeze partials so that a different kmem_cache_cpu structure than the local one can be specified. 
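A minimal sketch of the kind of caller this enables (the helper name is hypothetical; __flush_cpu_slab() in the diff below is the real user): code running on a live CPU can drain the per-cpu partial list of another, dead CPU, which the old this_cpu_ptr()-only version could not express:

	/*
	 * Hypothetical helper inside mm/slub.c: drain another CPU's per-cpu
	 * partial slabs. Safe only when that CPU is offline or otherwise
	 * guaranteed not to touch its kmem_cache_cpu concurrently.
	 */
	static void drain_remote_partials(struct kmem_cache *s, int cpu)
	{
		struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

		unfreeze_partials(s, c);
	}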
Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b2ada3db4225..33576b0cfc41 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1869,12 +1869,14 @@ redo: /* * Unfreeze all the cpu partial slabs. * - * This function must be called with interrupt disabled. + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void unfreeze_partials(struct kmem_cache *s) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct kmem_cache_node *n = NULL, *n2 = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { @@ -1960,7 +1962,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); local_irq_restore(flags); oldpage = NULL; pobjects = 0; @@ -2003,7 +2005,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } -- cgit From 3c58346525d82625e68e24f071804c2dc057b6f4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:01 +0000 Subject: slab: Simplify bootstrap The nodelists field in kmem_cache is pointing to the first unused object in the array field when bootstrap is complete. A problem with the current approach is that the statically sized kmem_cache structure use on boot can only contain NR_CPUS entries. If the number of nodes plus the number of cpus is greater then we would overwrite memory following the kmem_cache_boot definition. Increase the size of the array field to ensure that also the node pointers fit into the array field. Once we do that we no longer need the kmem_cache_nodelists array and we can then also use that structure elsewhere. Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e26bff5ed1a6..c7ea5234c4e9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -553,9 +553,7 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache kmem_cache_boot = { - .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -1559,6 +1557,15 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) } } +/* + * The memory after the last cpu cache pointer is used for the + * the nodelists pointer. + */ +static void setup_nodelists_pointer(struct kmem_cache *cachep) +{ + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; +} + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
@@ -1573,15 +1580,14 @@ void __init kmem_cache_init(void) int node; kmem_cache = &kmem_cache_boot; + setup_nodelists_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; - for (i = 0; i < NUM_INIT_LISTS; i++) { + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); - if (i < MAX_NUMNODES) - kmem_cache->nodelists[i] = NULL; - } + set_up_list3s(kmem_cache, CACHE_CACHE); /* @@ -1619,7 +1625,6 @@ void __init kmem_cache_init(void) list_add(&kmem_cache->list, &slab_caches); kmem_cache->colour_off = cache_line_size(); kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; - kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids @@ -2422,7 +2427,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + setup_nodelists_pointer(cachep); #if DEBUG /* -- cgit From 45530c4474d258b822e2639c786606d8257aad8b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:07 +0000 Subject: mm, sl[au]b: create common functions for boot slab creation Use a special function to create kmalloc caches and use that function in SLAB and SLUB. Acked-by: Joonsoo Kim Reviewed-by: Glauber Costa Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 48 ++++++++++++++---------------------------------- mm/slab.h | 5 +++++ mm/slab_common.c | 36 ++++++++++++++++++++++++++++++++++++ mm/slub.c | 37 +++++-------------------------------- 4 files changed, 60 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c7ea5234c4e9..e351acea6026 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1659,23 +1659,13 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; - sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); - - if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; - sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); - } + sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + + if (INDEX_AC != INDEX_L3) + sizes[INDEX_L3].cs_cachep = + create_kmalloc_cache(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); slab_early_init = 0; @@ -1687,24 +1677,14 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. 
*/ - if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_cachep->name = names->name; - sizes->cs_cachep->size = sizes->cs_size; - sizes->cs_cachep->object_size = sizes->cs_size; - sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes->cs_cachep->list, &slab_caches); - } + if (!sizes->cs_cachep) + sizes->cs_cachep = create_kmalloc_cache(names->name, + sizes->cs_size, ARCH_KMALLOC_FLAGS); + #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_dmacachep->name = names->name_dma; - sizes->cs_dmacachep->size = sizes->cs_size; - sizes->cs_dmacachep->object_size = sizes->cs_size; - sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_dmacachep, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); - list_add(&sizes->cs_dmacachep->list, &slab_caches); + sizes->cs_dmacachep = create_kmalloc_cache( + names->name_dma, sizes->cs_size, + SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); #endif sizes++; names++; diff --git a/mm/slab.h b/mm/slab.h index 66a62d3536c6..492eafa0b538 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -35,6 +35,11 @@ extern struct kmem_cache *kmem_cache; /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + #ifdef CONFIG_SLUB struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); diff --git a/mm/slab_common.c b/mm/slab_common.c index b705be7faa48..497b45c25bae 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,6 +202,42 @@ int slab_is_available(void) return slab_state >= UP; } +#ifndef CONFIG_SLOB +/* Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, + unsigned long flags) +{ + int err; + + s->name = name; + s->size = s->object_size = size; + s->align = ARCH_KMALLOC_MINALIGN; + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%zd failed. 
Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, + unsigned long flags) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +#endif /* !CONFIG_SLOB */ + + #ifdef CONFIG_SLABINFO static void print_slabinfo_header(struct seq_file *m) { diff --git a/mm/slub.c b/mm/slub.c index 33576b0cfc41..1be172c157c3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3245,32 +3245,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - - s->name = name; - s->size = s->object_size = size; - s->align = ARCH_KMALLOC_MINALIGN; - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slab_mutex here. - */ - if (kmem_cache_open(s, flags)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -3948,6 +3922,10 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (err) return err; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5249,13 +5227,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; - - if (slab_state < FULL) - /* Defer until later */ - return 0; + int unmergeable = slab_unmergeable(s); - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. -- cgit From dffb4d605c23110e3ad54b8c9f244a8235c013c2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:07 +0000 Subject: slub: Use statically allocated kmem_cache boot structure for bootstrap Simplify bootstrap by statically allocated two kmem_cache structures. These are freed after bootup is complete. Allows us to no longer worry about calculations of sizes of kmem_cache structures during bootstrap. Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 67 +++++++++++++++++++-------------------------------------------- 1 file changed, 20 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 1be172c157c3..c82453ac812a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -176,8 +176,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -static int kmem_size = sizeof(struct kmem_cache); - #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif @@ -3634,15 +3632,16 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. 
Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. */ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3658,70 +3657,44 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; + int caches = 2; if (debug_guardpage_minorder()) slub_max_order = 0; - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); - - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. - */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; - - kmem_cache_node->name = "kmem_cache_node"; - kmem_cache_node->size = kmem_cache_node->object_size = - sizeof(struct kmem_cache_node); - kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache->name = "kmem_cache"; - kmem_cache->size = kmem_cache->object_size = kmem_size; - kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. */ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ -- cgit From 2f9baa9fcf8d0a204ca129a671d6086cc100faab Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:09 +0000 Subject: slab: Use the new create_boot_cache function to simplify bootstrap Simplify setup and reduce code in kmem_cache_init(). 
This allows us to get rid of initarray_cache as well as the manual setup code for the kmem_cache and kmem_cache_node arrays during bootstrap. We introduce a new bootstrap state "PARTIAL" for slab that signals the creation of a kmem_cache boot cache. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e351acea6026..e1790e56fd86 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -547,8 +547,6 @@ static struct cache_names __initdata cache_names[] = { #undef CACHE }; -static struct arraycache_init initarray_cache __initdata = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; @@ -1572,12 +1570,9 @@ static void setup_nodelists_pointer(struct kmem_cache *cachep) */ void __init kmem_cache_init(void) { - size_t left_over; struct cache_sizes *sizes; struct cache_names *names; int i; - int order; - int node; kmem_cache = &kmem_cache_boot; setup_nodelists_pointer(kmem_cache); @@ -1618,36 +1613,16 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_mem_id(); - /* 1) create the kmem_cache */ - INIT_LIST_HEAD(&slab_caches); - list_add(&kmem_cache->list, &slab_caches); - kmem_cache->colour_off = cache_line_size(); - kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *); - kmem_cache->object_size = kmem_cache->size; - kmem_cache->size = ALIGN(kmem_cache->object_size, - cache_line_size()); - kmem_cache->reciprocal_buffer_size = - reciprocal_value(kmem_cache->size); - - for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, kmem_cache->size, - cache_line_size(), 0, &left_over, &kmem_cache->num); - if (kmem_cache->num) - break; - } - BUG_ON(!kmem_cache->num); - kmem_cache->gfporder = order; - kmem_cache->colour = left_over / kmem_cache->colour_off; - kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *), + SLAB_HWCACHE_ALIGN); + list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1695,7 +1670,6 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* @@ -2250,7 +2224,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) if (slab_state == DOWN) { /* - * Note: the first kmem_cache_create must create the cache + * Note: Creation of first cache (kmem_cache). + * The setup_list3s is taken care + * of by the caller of __kmem_cache_create + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + slab_state = PARTIAL; + } else if (slab_state == PARTIAL) { + /* + * Note: the second kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of * further caches will BUG(). 
*/ @@ -2258,7 +2240,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /* * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the first cache, then we need to set up all its list3s, + * the second cache, then we need to set up all its list3s, * otherwise the creation of further caches will BUG(). */ set_up_list3s(cachep, SIZE_AC); @@ -2267,6 +2249,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) else slab_state = PARTIAL_ARRAYCACHE; } else { + /* Remaining boot caches */ cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); -- cgit From 4590685546a374fb0f60682ce0e3a6fd48911d46 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 Nov 2012 16:23:16 +0000 Subject: mm/sl[aou]b: Common alignment code Extract the code to do object alignment from the allocators. Do the alignment calculations in slab_common so that the __kmem_cache_create functions of the allocators do not have to deal with alignment. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 20 -------------------- mm/slab.h | 3 +++ mm/slab_common.c | 32 ++++++++++++++++++++++++++++++-- mm/slob.c | 10 ---------- mm/slub.c | 38 +------------------------------------- 5 files changed, 34 insertions(+), 69 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e1790e56fd86..2c3a2e0394db 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2337,22 +2337,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* calculate the final buffer alignment: */ - - /* 1) arch recommendation: can be overridden for debug */ - if (flags & SLAB_HWCACHE_ALIGN) { - /* - * Default alignment: as specified by the arch code. Except if - * an object is really small, then squeeze multiple objects into - * one cacheline. - */ - ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - } else { - ralign = BYTES_PER_WORD; - } - /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated @@ -2369,10 +2353,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(REDZONE_ALIGN - 1); } - /* 2) arch mandated alignment */ - if (ralign < ARCH_SLAB_MINALIGN) { - ralign = ARCH_SLAB_MINALIGN; - } /* 3) caller mandated alignment */ if (ralign < cachep->align) { ralign = cachep->align; diff --git a/mm/slab.h b/mm/slab.h index 492eafa0b538..1cb9c9ee0e6f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -32,6 +32,9 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); diff --git a/mm/slab_common.c b/mm/slab_common.c index 497b45c25bae..a8e76d79ee65 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -72,6 +72,34 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. 
+ * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + + /* * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -124,7 +152,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { s->object_size = s->size = size; - s->align = align; + s->align = calculate_alignment(flags, align, size); s->ctor = ctor; s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { @@ -211,7 +239,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->name = name; s->size = s->object_size = size; - s->align = ARCH_KMALLOC_MINALIGN; + s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); err = __kmem_cache_create(s, flags); if (err) diff --git a/mm/slob.c b/mm/slob.c index 87e16c4d9143..795bab7d391d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -123,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) -#define SLOB_ALIGN L1_CACHE_BYTES /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -527,20 +526,11 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - size_t align = c->size; - if (flags & SLAB_DESTROY_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } c->flags = flags; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - return 0; } diff --git a/mm/slub.c b/mm/slub.c index c82453ac812a..9640edd2cc78 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2760,32 +2760,6 @@ static inline int calculate_order(int size, int reserved) return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void init_kmem_cache_node(struct kmem_cache_node *n) { @@ -2919,7 +2893,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->object_size; - unsigned long align = s->align; int order; /* @@ -2990,20 +2963,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size += sizeof(void *); #endif - /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. 
- */ - align = calculate_alignment(flags, align, s->object_size); - s->align = align; - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. */ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; @@ -3032,7 +2997,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->max = s->oo; return !!oo_objects(s->oo); - } static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) -- cgit From cef23d9db6b76732d9b0933cb162358a6a1f43d7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 6 Nov 2012 09:56:01 +0000 Subject: mm,generic: only flush the local TLB in ptep_set_access_flags The function ptep_set_access_flags is only ever used to upgrade access permissions to a page. That means the only negative side effect of not flushing remote TLBs is that other CPUs may incur spurious page faults, if they happen to access the same address, and still have a PTE with the old permissions cached in their TLB. Having another CPU maybe incur a spurious page fault is faster than always incurring the cost of a remote TLB flush, so replace the remote TLB flush with a purely local one. This should be safe on every architecture that correctly implements flush_tlb_fix_spurious_fault() to actually invalidate the local TLB entry that caused a page fault, as well as on architectures where the hardware invalidates TLB entries that cause page faults. In the unlikely event that you are hitting what appears to be an infinite loop of page faults, and 'git bisect' took you to this changeset, your architecture needs to implement flush_tlb_fix_spurious_fault to actually flush the TLB entry. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Michel Lespinasse Cc: Ingo Molnar --- mm/pgtable-generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..d8397da42fe6 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -12,8 +12,8 @@ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed, and - * writable). Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed), as well as write + * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } return changed; } -- cgit From 8d1acce4537c4e2f5889ed9ba9b8eddb80d99820 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 9 Oct 2012 15:31:59 +0200 Subject: mm: Only flush the TLB when clearing an accessible pte If ptep_clear_flush() is called to clear a page table entry that is accessible anyway by the CPU, eg. a _PAGE_PROTNONE page table entry, there is no need to flush the TLB on remote CPUs. 
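The new check relies on a pte_accessible() helper. As a hedged sketch of how that is expected to be wired up (shown here as an assumption; it is not part of this mm/ diff), an architecture that can tell defines it precisely, e.g. as a present-bit test on x86, while a conservative generic fallback preserves the old always-flush behaviour:

	/*
	 * Sketch of the expected asm-generic fallback, not in this patch:
	 * treat every pte as potentially cached, i.e. always flush.
	 */
	#ifndef pte_accessible
	# define pte_accessible(pte)	((void)(pte), 1)
	#endif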
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-vm3rkzevahelwhejx5uwm8ex@git.kernel.org Signed-off-by: Ingo Molnar --- mm/pgtable-generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d8397da42fe6..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, { pte_t pte; pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - flush_tlb_page(vma, address); + if (pte_accessible(pte)) + flush_tlb_page(vma, address); return pte; } #endif -- cgit From 4fd017708c4a067da51a2b5cf8aedddf4e840b1f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 12 Oct 2011 21:06:51 +0200 Subject: mm: Check if PTE is already allocated during page fault With transparent hugepage support, handle_mm_fault() has to be careful that a normal PMD has been established before handling a PTE fault. To achieve this, it used __pte_alloc() directly instead of pte_alloc_map as pte_alloc_map is unsafe to run against a huge PMD. pte_offset_map() is called once it is known the PMD is safe. pte_alloc_map() is smart enough to check if a PTE is already present before calling __pte_alloc but this check was lost. As a consequence, PTEs may be allocated unnecessarily and the page table lock taken. This useless PTE does get cleaned up but it's a performance hit which is visible in page_test from aim9. This patch simply re-adds the check normally done by pte_alloc_map to check if the PTE needs to be allocated before taking the page table lock. The effect is noticeable in page_test from aim9. AIM9 2.6.38-vanilla 2.6.38-checkptenone creat-clo 446.10 ( 0.00%) 424.47 (-5.10%) page_test 38.10 ( 0.00%) 42.04 ( 9.37%) brk_test 52.45 ( 0.00%) 51.57 (-1.71%) exec_test 382.00 ( 0.00%) 456.90 (16.39%) fork_test 60.11 ( 0.00%) 67.79 (11.34%) MMTests Statistics: duration Total Elapsed Time (seconds) 611.90 612.22 (While this affects 2.6.38, it is a performance rather than a functional bug and normally outside the rules -stable. While the big performance differences are to a microbench, the difference in fork and exec performance may be significant enough that -stable wants to consider the patch) Reported-by: Raz Ben Yehuda Signed-off-by: Mel Gorman Signed-off-by: Andrea Arcangeli Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Rik van Riel [ Picked this up from the AutoNUMA tree to help it upstream and to allow apples-to-apples performance comparisons. ] Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b415..35c66a269bcc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -710,7 +710,8 @@ out: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) diff --git a/mm/memory.c b/mm/memory.c index 221fc9ffcab1..7cf762857baa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3560,7 +3560,8 @@ retry: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread.
*/ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) -- cgit From 7da4d641c58d201c3cc1835c05ca1a7fa26f0856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Nov 2012 03:14:23 +0100 Subject: mm: Count the number of pages affected in change_protection() This will be used for three kinds of purposes: - to optimize mprotect() - to speed up working set scanning for working set areas that have not been touched - to more accurately scan per real working set No change in functionality from this patch. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- mm/hugetlb.c | 10 ++++++++-- mm/mprotect.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..712895eda0d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3014,7 +3014,7 @@ same_page: return i ? i : -EFAULT; } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -3022,6 +3022,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); + unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); @@ -3032,12 +3033,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) + if (huge_pmd_unshare(mm, &address, ptep)) { + pages++; continue; + } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); + pages++; } } spin_unlock(&mm->page_table_lock); @@ -3049,6 +3053,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + + return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..1e265be25f85 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,12 +35,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pte_t *pte, oldpte; spinlock_t *ptl; + unsigned long pages = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -60,6 +61,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -72,18 +74,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } + pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); 
arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + + return pages; } -static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pmd_t *pmd; unsigned long next; + unsigned long pages = 0; pmd = pmd_offset(pud, addr); do { @@ -91,35 +97,42 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) + else if (change_huge_pmd(vma, pmd, addr, newprot)) { + pages += HPAGE_PMD_NR; continue; + } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); + + return pages; } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pud_t *pud; unsigned long next; + unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(vma, pud, addr, next, newprot, + pages += change_pmd_range(vma, pud, addr, next, newprot, dirty_accountable); } while (pud++, addr = next, addr != end); + + return pages; } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -127,6 +140,7 @@ static void change_protection(struct vm_area_struct *vma, pgd_t *pgd; unsigned long next; unsigned long start = addr; + unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -135,10 +149,30 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(vma, pgd, addr, next, newprot, + pages += change_pud_range(vma, pgd, addr, next, newprot, dirty_accountable); } while (pgd++, addr = next, addr != end); + flush_tlb_range(vma, start, end); + + return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pages; + + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot); + else + pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); + + return pages; } int @@ -213,12 +247,8 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - hugetlb_change_protection(vma, start, end, vma->vm_page_prot); - else - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); - mmu_notifier_invalidate_range_end(mm, start, end); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); -- cgit From 
1233d588210737ed3696b44c26e71dfa44a6995a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 19 Nov 2012 03:14:24 +0100 Subject: mm: Optimize the TLB flush of sys_mprotect() and change_protection() users Reuse the NUMA code's 'modified page protections' count that change_protection() computes and skip the TLB flush if there are no changes to a range that sys_mprotect() modifies. Given that mprotect() already optimizes the same-flags case I expected this optimization to dominantly trigger on CONFIG_NUMA_BALANCING=y kernels - but even with that feature disabled it triggers rather often. There are two reasons for that: 1) While sys_mprotect() already optimizes the same-flag case: if (newflags == oldflags) { *pprev = vma; return 0; } this test works in many cases, but it is too sharp in some others, where it differentiates between protection values that the underlying PTE format makes no distinction about, such as PROT_EXEC == PROT_READ on x86. 2) Even where the pte format over vma flag changes necessitates a modification of the pagetables, there might be no pagetables yet to modify: they might not be instantiated yet. During a regular desktop bootup this optimization hits a couple of hundred times. During a Java test I measured thousands of hits. So this optimization improves sys_mprotect() in general, not just CONFIG_NUMA_BALANCING=y kernels. [ We could further increase the efficiency of this optimization if change_pte_range() and change_huge_pmd() were a bit smarter about recognizing exact-same-value protection masks - when the hardware can do that safely. This would probably further speed up mprotect(). ] Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- mm/mprotect.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 1e265be25f85..7c3628a8b486 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -153,7 +153,9 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, dirty_accountable); } while (pgd++, addr = next, addr != end); - flush_tlb_range(vma, start, end); + /* Only flush the TLB if we actually modified any entries: */ + if (pages) + flush_tlb_range(vma, start, end); return pages; } -- cgit From 5647bc293ab15f66a7b1cda850c5e9d162a6c7c2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 10:46:20 +0100 Subject: mm: compaction: Move migration fail/success stats to migrate.c The compact_pages_moved and compact_pagemigrate_failed events are convenient for determining if compaction is active and to what degree migration is succeeding but it's at the wrong level. Other users of migration may also want to know if migration is working properly and this will be particularly true for any automated NUMA migration. This patch moves the counters down to migration with the new events called pgmigrate_success and pgmigrate_fail. The compact_blocks_moved counter is removed because while it was useful for debugging initially, it's worthless now as no meaningful conclusions can be drawn from its value.
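Because the counters are bumped inside migrate_pages() itself, every migration user is now accounted, and the values can be read straight from /proc/vmstat. A rough userspace sketch (not part of the patch) that derives a success/failure summary from the two new events:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		FILE *f = fopen("/proc/vmstat", "r");
		char name[64];
		unsigned long long val, ok = 0, fail = 0;

		if (!f)
			return 1;
		/* /proc/vmstat is a list of "name value" pairs */
		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "pgmigrate_success"))
				ok = val;
			else if (!strcmp(name, "pgmigrate_fail"))
				fail = val;
		}
		fclose(f);
		printf("page migrations: %llu succeeded, %llu failed\n", ok, fail);
		return 0;
	}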
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/compaction.c | 4 ---- mm/migrate.c | 6 ++++++ mm/vmstat.c | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 9eef55838fca..00ad88395216 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -994,10 +994,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; - count_vm_event(COMPACTBLOCKS); - count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); - if (nr_remaining) - count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d773705..04687f69cc17 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -962,6 +962,7 @@ int migrate_pages(struct list_head *from, { int retry = 1; int nr_failed = 0; + int nr_succeeded = 0; int pass = 0; struct page *page; struct page *page2; @@ -988,6 +989,7 @@ int migrate_pages(struct list_head *from, retry++; break; case 0: + nr_succeeded++; break; default: /* Permanent failure */ @@ -998,6 +1000,10 @@ int migrate_pages(struct list_head *from, } rc = 0; out: + if (nr_succeeded) + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); + if (nr_failed) + count_vm_events(PGMIGRATE_FAIL, nr_failed); if (!swapwrite) current->flags &= ~PF_SWAPWRITE; diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111b..89a7fd665b32 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,10 +774,11 @@ const char * const vmstat_text[] = { "pgrotated", +#ifdef CONFIG_MIGRATION + "pgmigrate_success", + "pgmigrate_fail", +#endif #ifdef CONFIG_COMPACTION - "compact_blocks_moved", - "compact_pages_moved", - "compact_pagemigrate_failed", "compact_stall", "compact_fail", "compact_success", -- cgit From 7b2a2d4a18fffac3c4872021529b0657896db788 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 14:07:31 +0100 Subject: mm: migrate: Add a tracepoint for migrate_pages The pgmigrate_success and pgmigrate_fail vmstat counters tells the user about migration activity but not the type or the reason. This patch adds a tracepoint to identify the type of page migration and why the page is being migrated. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/compaction.c | 3 ++- mm/memory-failure.c | 3 ++- mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 6 ++++-- mm/migrate.c | 10 ++++++++-- mm/page_alloc.c | 3 ++- 6 files changed, 20 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 00ad88395216..2c077a78487c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -990,7 +990,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + cc->sync ? 
MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6c5899b9034a..ddb68a169e45 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1558,7 +1558,8 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b91..e598bd15c041 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -812,7 +812,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. */ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC); + true, MIGRATE_SYNC, + MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d04a8a54c294..66e90ecc2350 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -961,7 +961,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1202,7 +1203,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 04687f69cc17..27be9c923dc1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,9 @@ #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" /* @@ -958,7 +961,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1004,6 +1007,8 @@ out: count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); if (nr_failed) count_vm_events(PGMIGRATE_FAIL, nr_failed); + trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); + if (!swapwrite) current->flags &= ~PF_SWAPWRITE; @@ -1145,7 +1150,8 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC); + (unsigned long)pm, 0, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7bb35ac0964a..5953dc2d196f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5707,7 +5707,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, false, MIGRATE_SYNC); + 0, false, MIGRATE_SYNC, + MR_CMA); } putback_lru_pages(&cc->migratepages); -- cgit From 397487db696cae0b026a474a5cd66f4e372995e6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 12:00:10 +0100 Subject: mm: compaction: Add scanned and isolated counters for compaction Compaction already has tracepoints to count scanned and isolated pages but it requires that ftrace be enabled and if that information has to be written to disk then it can be disruptive. 
This patch adds vmstat counters for compaction called compact_migrate_scanned, compact_free_scanned and compact_isolated. With these counters, it is possible to define a basic cost model for compaction. This approximates of how much work compaction is doing and can be compared that with an oprofile showing TLB misses and see if the cost of compaction is being offset by THP for example. Minimally a compaction patch can be evaluated in terms of whether it increases or decreases cost. The basic cost model looks like this Fundamental unit u: a word sizeof(void *) Ca = cost of struct page access = sizeof(struct page) / u Cmc = Cost migrate page copy = (Ca + PAGE_SIZE/u) * 2 Cmf = Cost migrate failure = Ca * 2 Ci = Cost page isolation = (Ca + Wi) where Wi is a constant that should reflect the approximate cost of the locking operation. Csm = Cost migrate scanning = Ca Csf = Cost free scanning = Ca Overall cost = (Csm * compact_migrate_scanned) + (Csf * compact_free_scanned) + (Ci * compact_isolated) + (Cmc * pgmigrate_success) + (Cmf * pgmigrate_failed) Where the values are read from /proc/vmstat. This is very basic and ignores certain costs such as the allocation cost to do a migrate page copy but any improvement to the model would still use the same vmstat counters. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/compaction.c | 8 ++++++++ mm/vmstat.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2c077a78487c..aee7443a4d5a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -356,6 +356,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); + count_vm_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) + count_vm_events(COMPACTISOLATED, total_isolated); + return total_isolated; } @@ -646,6 +650,10 @@ next_pageblock: trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); + if (nr_isolated) + count_vm_events(COMPACTISOLATED, nr_isolated); + return low_pfn; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 89a7fd665b32..3a067fabe190 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -779,6 +779,9 @@ const char * const vmstat_text[] = { "pgmigrate_fail", #endif #ifdef CONFIG_COMPACTION + "compact_migrate_scanned", + "compact_free_scanned", + "compact_isolated", "compact_stall", "compact_fail", "compact_success", -- cgit From 0b9d705297b273657923518dbea2377cd03532ed Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 5 Oct 2012 21:36:27 +0200 Subject: mm: numa: Support NUMA hinting page faults from gup/gup_fast Introduce FOLL_NUMA to tell follow_page to check pte/pmd_numa. get_user_pages must use FOLL_NUMA, and it's safe to do so because it always invokes handle_mm_fault and retries the follow_page later. KVM secondary MMU page faults will trigger the NUMA hinting page faults through gup_fast -> get_user_pages -> follow_page -> handle_mm_fault. Other follow_page callers like KSM should not use FOLL_NUMA, or they would fail to get the pages if they use follow_page instead of get_user_pages. [ This patch was picked up from the AutoNUMA tree. ] Originally-by: Andrea Arcangeli Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ ported to this tree. 
] Signed-off-by: Ingo Molnar Reviewed-by: Rik van Riel --- mm/memory.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7cf762857baa..cd8e0daf1912 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1517,6 +1517,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(mm, pmd); @@ -1546,6 +1548,8 @@ split_fallthrough: pte = *ptep; if (!pte_present(pte)) goto no_page; + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1697,6 +1701,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + /* + * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault + * would be called on PROT_NONE ranges. We must never invoke + * handle_mm_fault on PROT_NONE ranges or the NUMA hinting + * page faults would unprotect the PROT_NONE ranges if + * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd + * bitflag. So to avoid that, don't set FOLL_NUMA if + * FOLL_FORCE is set. + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + i = 0; do { -- cgit From 1ba6e0b50b479cbadb8f05ebde3020da9ac87201 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 4 Oct 2012 01:51:06 +0200 Subject: mm: numa: split_huge_page: transfer the NUMA type from the pmd to the pte When we split a transparent hugepage, transfer the NUMA type from the pmd to the pte if needed. Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 35c66a269bcc..cd24aa562144 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1364,6 +1364,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); -- cgit From d10e63f29488b0f312a443f9507ea9b6fd3c9090 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:31 +0200 Subject: mm: numa: Create basic numa page hinting infrastructure Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history. In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from. The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. 
Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault. [dhillf@gmail.com: Fix typo] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/huge_memory.c | 22 +++++++++++ mm/memory.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 131 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cd24aa562144..f5f37630c54d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1018,6 +1018,28 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp) +{ + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { diff --git a/mm/memory.c b/mm/memory.c index cd8e0daf1912..e30616f2cc3d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page; + spinlock_t *ptl; + + /* + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + * + * ptep_modify_prot_start is not called as this is clearing + * the _PAGE_NUMA bit and it is not really expected that there + * would be concurrent hardware modifications to the PTE. 
+ */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) + goto out_unlock; + pte = pte_mknonnuma(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + +out_unlock: + pte_unmap_unlock(ptep, ptl); + return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd; + pte_t *pte, *orig_pte; + unsigned long _addr = addr & PMD_MASK; + unsigned long offset; + spinlock_t *ptl; + bool numa = false; + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; + if (pmd_numa(pmd)) { + set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + numa = true; + } + spin_unlock(&mm->page_table_lock); + + if (!numa) + return 0; + + /* we're in a page fault so some vma must be in the range */ + BUG_ON(!vma); + BUG_ON(vma->vm_start >= _addr + PMD_SIZE); + offset = max(_addr, vma->vm_start) & ~PMD_MASK; + VM_BUG_ON(offset >= PMD_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); + pte += offset >> PAGE_SHIFT; + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) + continue; + if (addr >= vma->vm_end) { + vma = find_vma(mm, addr); + /* there's a pte present so there must be a vma */ + BUG_ON(!vma); + BUG_ON(addr < vma->vm_start); + } + if (pte_numa(pteval)) { + pteval = pte_mknonnuma(pteval); + set_pte_at(mm, addr, pte, pteval); + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page)) + continue; + } + pte_unmap_unlock(orig_pte, ptl); + + return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } + if (pte_numa(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3554,9 +3654,11 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) { + if (pmd_numa(*pmd)) + return do_huge_pmd_numa_page(mm, address, + orig_pmd, pmd); + + if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3568,10 +3670,14 @@ retry: goto retry; return ret; } + return 0; } } + if (pmd_numa(*pmd)) + return do_pmd_numa_page(mm, vma, address, pmd); + /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- cgit From 479e2802d09f1e18a97262c4c6f8f17ae5884bd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:28 +0200 Subject: mm: mempolicy: Make MPOL_LOCAL a real policy Make MPOL_LOCAL a real and exposed policy such that applications that relied on the previous default behaviour can explicitly request it. 
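For illustration only (this snippet is not part of the patch), an application could opt in to the now-exposed policy roughly as follows, assuming a libnuma-style <numaif.h> that defines MPOL_LOCAL and wraps the set_mempolicy() syscall:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	/* MPOL_LOCAL takes no nodemask, so pass NULL/0 */
	if (set_mempolicy(MPOL_LOCAL, NULL, 0) != 0) {
		perror("set_mempolicy(MPOL_LOCAL)");
		return 1;
	}
	/* From here on, allocations prefer the node of the CPU that
	 * first touches the memory. */
	return 0;
}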
Requested-by: Christoph Lameter Reviewed-by: Rik van Riel Cc: Lee Schermerhorn Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/mempolicy.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 66e90ecc2350..54bd3e5ed776 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -269,6 +269,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -2399,7 +2403,6 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2452,12 +2455,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode <= MPOL_LOCAL; mode++) { + for (mode = 0; mode < MPOL_MAX; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode > MPOL_LOCAL) + if (mode >= MPOL_MAX) goto out; switch (mode) { -- cgit From d3a710337b0590f43fd236d5e6518439afc7410a Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:29 +0200 Subject: mm: mempolicy: Add MPOL_NOOP This patch augments the MPOL_MF_LAZY feature by adding a "NOOP" policy to mbind(). When the NOOP policy is used with the 'MOVE and 'LAZY flags, mbind() will map the pages PROT_NONE so that they will be migrated on the next touch. This allows an application to prepare for a new phase of operation where different regions of shared storage will be assigned to worker threads, w/o changing policy. Note that we could just use "default" policy in this case. However, this also allows an application to request that pages be migrated, only if necessary, to follow any arbitrary policy that might currently apply to a range of pages, without knowing the policy, or without specifying multiple mbind()s for ranges with different policies. [ Bug in early version of mpol_parse_str() reported by Fengguang Wu. ] Bug-Reported-by: Reported-by: Fengguang Wu Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/mempolicy.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 54bd3e5ed776..c21e91477c4f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -251,10 +251,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT) { + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -1147,7 +1147,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT) + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -2409,7 +2409,8 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local" + [MPOL_LOCAL] = "local", + [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2460,7 +2461,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX) + if (mode >= MPOL_MAX || mode == MPOL_NOOP) goto out; switch (mode) { -- cgit From 771fb4d806a92bf6c988fcfbd286ae40a9374332 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:30 +0200 Subject: mm: mempolicy: Check for misplaced page This patch provides a new function to test whether a page resides on a node that is appropriate for the mempolicy for the vma and address where the page is supposed to be mapped. This involves looking up the node where the page belongs. So, the function returns that node so that it may be used to allocated the page without consulting the policy again. A subsequent patch will call this function from the fault path. Because of this, I don't want to go ahead and allocate the page, e.g., via alloc_page_vma() only to have to free it if it has the correct policy. So, I just mimic the alloc_page_vma() node computation logic--sort of. Note: we could use this function to implement a MPOL_MF_STRICT behavior when migrating pages to match mbind() mempolicy--e.g., to ensure that pages in an interleaved range are reinterleaved rather than left where they are when they reside on any page in the interleave nodemask. Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds [ Added MPOL_F_LAZY to trigger migrate-on-fault; simplified code now that we don't have to bother with special crap for interleaved ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/mempolicy.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c21e91477c4f..df1466d3d2d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2181,6 +2181,82 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. 
+ */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit From 7039e1dbec6eeaa8ecab43a82d6589eeced995c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:34 +0200 Subject: mm: migrate: Introduce migrate_misplaced_page() Note: This was originally based on Peter's patch "mm/migrate: Introduce migrate_misplaced_page()" but borrows extremely heavily from Andrea's "autonuma: memory follows CPU algorithm and task/mm_autonuma stats collection". The end result is barely recognisable so signed-offs had to be dropped. If original authors are ok with it, I'll re-add the signed-off-bys. Add migrate_misplaced_page() which deals with migrating pages from faults. Based-on-work-by: Lee Schermerhorn Based-on-work-by: Peter Zijlstra Based-on-work-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/migrate.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 27be9c923dc1..d168aec98427 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -282,7 +282,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { - int expected_count; + int expected_count = 0; void **pslot; if (!mapping) { @@ -1415,4 +1415,108 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } -#endif + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns true if this is a safe migration target node for misplaced NUMA + * pages. Currently it only checks the watermarks which crude + */ +static bool migrate_balanced_pgdat(struct pglist_data *pgdat, + int nr_migrate_pages) +{ + int z; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable) + continue; + + /* Avoid waking kswapd by allocating pages_to_migrate pages. 
*/ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone) + + nr_migrate_pages, + 0, 0)) + continue; + return true; + } + return false; +} + +static struct page *alloc_misplaced_dst_page(struct page *page, + unsigned long data, + int **result) +{ + int nid = (int) data; + struct page *newpage; + + newpage = alloc_pages_exact_node(nid, + (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN) & + ~GFP_IOFS, 0); + return newpage; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. + */ +int migrate_misplaced_page(struct page *page, int node) +{ + int isolated = 0; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; + } + + /* Avoid migrating to a node that is nearly full */ + if (migrate_balanced_pgdat(NODE_DATA(node), 1)) { + int page_lru; + + if (isolate_lru_page(page)) { + put_page(page); + goto out; + } + isolated = 1; + + /* + * Page is isolated which takes a reference count so now the + * callers reference can be safely dropped without the page + * disappearing underneath us during migration + */ + put_page(page); + + page_lru = page_is_file_cache(page); + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + list_add(&page->lru, &migratepages); + } + + if (isolated) { + int nr_remaining; + + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } + } + BUG_ON(!list_empty(&migratepages)); +out: + return isolated; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ -- cgit From 149c33e1c98f83050870514f380902dc6d617bd5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 27 Nov 2012 14:03:05 +0000 Subject: mm: migrate: Drop the misplaced pages reference count if the target node is full If we have to avoid migrating to a node that is nearly full, put page and return zero. Signed-off-by: Hillf Danton Signed-off-by: Mel Gorman --- mm/migrate.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d168aec98427..c7d550011a64 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1489,18 +1489,21 @@ int migrate_misplaced_page(struct page *page, int node) } isolated = 1; - /* - * Page is isolated which takes a reference count so now the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration - */ - put_page(page); - page_lru = page_is_file_cache(page); inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); list_add(&page->lru, &migratepages); } + /* + * Page is either isolated or there is not enough space on the target + * node. If isolated, then it has taken a reference count and the + * callers reference can be safely dropped without the page + * disappearing underneath us during migration. Otherwise the page is + * not to be migrated but the callers reference should still be + * dropped so it does not leak. 
+ */ + put_page(page); + if (isolated) { int nr_remaining; -- cgit From 4daae3b4b9e49b7e0935499a352f1c59d90287d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 11:33:45 +0000 Subject: mm: mempolicy: Use _PAGE_NUMA to migrate pages Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but sufficiently different that the signed-off-bys were dropped Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page() pieces into an effective migrate on fault scheme. Note that (on x86) we rely on PROT_NONE pages being !present and avoid the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the page-migration performance. Based-on-work-by: Peter Zijlstra Signed-off-by: Mel Gorman --- mm/huge_memory.c | 31 ++++++++++++++++++++++++++++--- mm/memory.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5f37630c54d..5723b551c023 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1019,17 +1020,39 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page; + struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); + get_page(page); + spin_unlock(&mm->page_table_lock); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) + goto clear_pmdnuma; + + /* + * Due to lacking code to migrate thp pages, we'll split + * (which preserves the special PROT_NONE) and re-take the + * fault on the normal pages. 
+ */ + split_huge_page(page); + put_page(page); + return 0; + +clear_pmdnuma: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); @@ -1037,6 +1060,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, out_unlock: spin_unlock(&mm->page_table_lock); + if (page) + put_page(page); return 0; } diff --git a/mm/memory.c b/mm/memory.c index e30616f2cc3d..d52542680e10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -3451,8 +3452,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { - struct page *page; + struct page *page = NULL; spinlock_t *ptl; + int current_nid, target_nid; /* * The "pte" at this point cannot be used safely without @@ -3465,8 +3467,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) - goto out_unlock; + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + pte = pte_mknonnuma(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3477,8 +3482,25 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -out_unlock: + get_page(page); + current_nid = page_to_nid(page); + target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + /* + * Account for the fault against the current node if it not + * being replaced regardless of where the page is located. + */ + current_nid = numa_node_id(); + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + if (migrate_misplaced_page(page, target_nid)) + current_nid = target_nid; + +out: return 0; } @@ -3655,7 +3677,7 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { if (pmd_numa(*pmd)) - return do_huge_pmd_numa_page(mm, address, + return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { -- cgit From b24f53a0bea38b266d219ee651b22dba727c44ae Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Add MPOL_MF_LAZY NOTE: Once again there is a lot of patch stealing and the end result is sufficiently different that I had to drop the signed-offs. Will re-add if the original authors are ok with that. This patch adds another mbind() flag to request "lazy migration". The flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected pages are marked PROT_NONE. The pages will be migrated in the fault path on "first touch", if the policy dictates at that time. "Lazy Migration" will allow testing of migrate-on-fault via mbind(). Also allows applications to specify that only subsequently touched pages be migrated to obey new policy, instead of all pages in range. This can be useful for multi-threaded applications working on a large shared data area that is initialized by an initial thread resulting in all pages on one [or a few, if overflowed] nodes. After PROT_NONE, the pages in regions assigned to the worker threads will be automatically migrated local to the threads on 1st touch. 
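For illustration only, a worker thread could request this behaviour roughly as sketched below, assuming the MPOL_NOOP mode and MPOL_MF_LAZY flag from earlier in this series were visible through <numaif.h> (a later patch in the series hides them from userspace again for now):

#include <numaif.h>

/* Hypothetical sketch: mark a worker's region so that pages migrate
 * towards the toucher on first access, without changing whatever
 * policy currently applies to the range. */
static long mark_region_lazy(void *buf, unsigned long len)
{
	return mbind(buf, len, MPOL_NOOP, NULL, 0,
		     MPOL_MF_MOVE | MPOL_MF_LAZY);
}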
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/mempolicy.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 170 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index df1466d3d2d8..51d3ebd8561e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include @@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * Here we search for not shared page mappings (mapcount == 1) and we + * set up the pmd/pte_numa on those mappings so the very next access + * will fire a NUMA hinting page fault. + */ +static int +change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + struct page *page; + unsigned long _address, end; + spinlock_t *ptl; + int ret = 0; + + VM_BUG_ON(address & ~PAGE_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + + if (pmd_trans_huge_lock(pmd, vma) == 1) { + int page_nid; + ret = HPAGE_PMD_NR; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) != 1) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page_nid = page_to_nid(page); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + ret += HPAGE_PMD_NR; + /* defer TLB flush to lower the overhead */ + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (pmd_trans_unstable(pmd)) + goto out; + VM_BUG_ON(!pmd_present(*pmd)); + + end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _address < end; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (!pte_present(pteval)) + continue; + if (pte_numa(pteval)) + continue; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (page_mapcount(page) != 1) + continue; + + set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); + + /* defer TLB flush to lower the overhead */ + ret++; + } + pte_unmap_unlock(pte, ptl); + + if (ret && !pmd_numa(*pmd)) { + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); + /* defer TLB flush to lower the overhead */ + } + +out: + return ret; +} + +/* Assumes mmap_sem is held */ +void +change_prot_numa(struct vm_area_struct *vma, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int progress = 0; + + while (address < end) { + VM_BUG_ON(address < vma->vm_start || + address + PAGE_SIZE > vma->vm_end); + + progress += change_prot_numa_range(mm, vma, address); + address = (address + PMD_SIZE) & PMD_MASK; + } + + /* + * Flush the TLB for the mm to start the NUMA hinting + * page faults after we finish scanning this vma part + * if there were any PTE updates + */ + if (progress) { + mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); + flush_tlb_range(vma, address, end); + mmu_notifier_invalidate_range_end(vma->vm_mm, address, 
end); + } +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma) && mode != MPOL_NOOP) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, false, MIGRATE_SYNC, @@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); -- cgit From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. 
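To illustrate why the helpers end up so cheap, the sketch below (x86-flavoured; the names and bit values are illustrative and not copied from the tree) shows how pte_numa()/pte_mknuma() reduce to a couple of bit operations once _PAGE_NUMA aliases _PAGE_PROTNONE:

#define SK_PAGE_PRESENT		0x001UL
#define SK_PAGE_PROTNONE	0x100UL
#define SK_PAGE_NUMA		SK_PAGE_PROTNONE	/* shared bit */

/* A pte counts as "numa" when the NUMA/PROTNONE bit is set and the
 * PRESENT bit is clear, so an access faults but can be told apart
 * from a real PROT_NONE mapping by checking the VMA permissions. */
static inline int sketch_pte_numa(unsigned long pteval)
{
	return (pteval & (SK_PAGE_NUMA | SK_PAGE_PRESENT)) == SK_PAGE_NUMA;
}

static inline unsigned long sketch_pte_mknuma(unsigned long pteval)
{
	return (pteval | SK_PAGE_NUMA) & ~SK_PAGE_PRESENT;
}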
Signed-off-by: Mel Gorman --- mm/huge_memory.c | 14 +++++- mm/mempolicy.c | 137 ++++++------------------------------------------------- mm/mprotect.c | 72 ++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 142 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5723b551c023..d79f7a55bf6f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1147,7 +1147,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1155,7 +1155,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 51d3ebd8561e..75d4600a5e92 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -568,134 +568,23 @@ static inline int check_pgd_range(struct vm_area_struct *vma, #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* - * Here we search for not shared page mappings (mapcount == 1) and we - * set up the pmd/pte_numa on those mappings so the very next access - * will fire a NUMA hinting page fault. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. 
*/ -static int -change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, *_pte; - struct page *page; - unsigned long _address, end; - spinlock_t *ptl; - int ret = 0; - - VM_BUG_ON(address & ~PAGE_MASK); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; - - if (pmd_trans_huge_lock(pmd, vma) == 1) { - int page_nid; - ret = HPAGE_PMD_NR; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) != 1) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page_nid = page_to_nid(page); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - ret += HPAGE_PMD_NR; - /* defer TLB flush to lower the overhead */ - spin_unlock(&mm->page_table_lock); - goto out; - } - - if (pmd_trans_unstable(pmd)) - goto out; - VM_BUG_ON(!pmd_present(*pmd)); - - end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _address < end; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (!pte_present(pteval)) - continue; - if (pte_numa(pteval)) - continue; - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (page_mapcount(page) != 1) - continue; - - set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); - - /* defer TLB flush to lower the overhead */ - ret++; - } - pte_unmap_unlock(pte, ptl); - - if (ret && !pmd_numa(*pmd)) { - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); - /* defer TLB flush to lower the overhead */ - } - -out: - return ret; -} - -/* Assumes mmap_sem is held */ -void -change_prot_numa(struct vm_area_struct *vma, - unsigned long address, unsigned long end) +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; - int progress = 0; - - while (address < end) { - VM_BUG_ON(address < vma->vm_start || - address + PAGE_SIZE > vma->vm_end); + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - progress += change_prot_numa_range(mm, vma, address); - address = (address + PMD_SIZE) & PMD_MASK; - } + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - /* - * Flush the TLB for the mm to start the NUMA hinting - * page faults after we finish scanning this vma part - * if there were any PTE updates - */ - if (progress) { - mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); - flush_tlb_range(vma, address, end); - mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); - } + return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index 7c3628a8b486..7ef6ae964e8f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,10 +35,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long 
addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -49,19 +50,39 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); - pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -83,9 +104,25 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -97,7 +134,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -105,8 +142,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + + if (prot_numa) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; @@ -114,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -126,7 +166,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); return pages; @@ -134,7 +174,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * static unsigned long 
change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -150,7 +190,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -162,7 +202,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; unsigned long pages; @@ -171,7 +211,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); mmu_notifier_invalidate_range_end(mm, start, end); return pages; @@ -249,7 +289,7 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); -- cgit From a720094ded8cbb303111035be91858011d2eac71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Nov 2012 09:37:58 +0000 Subject: mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now The use of MPOL_NOOP and MPOL_MF_LAZY to allow an application to explicitly request lazy migration is a good idea but the actual API has not been well reviewed and once released we have to support it. For now this patch prevents an application using the services. This will need to be revisited. Signed-off-by: Mel Gorman --- mm/mempolicy.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 75d4600a5e92..a7a62fe7c280 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -252,7 +252,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { + if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; @@ -1186,7 +1186,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -1241,7 +1241,7 @@ static long do_mbind(unsigned long start, unsigned long len, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... 
*/ - if (!IS_ERR(vma) && mode != MPOL_NOOP) + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); if (!err) { @@ -2530,7 +2530,6 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", - [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2581,7 +2580,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX || mode == MPOL_NOOP) + if (mode >= MPOL_MAX) goto out; switch (mode) { -- cgit From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- mm/huge_memory.c | 5 ++++- mm/memory.c | 14 +++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79f7a55bf6f..ee8133794a56 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ split_huge_page(page); put_page(page); + return 0; clear_pmdnuma: @@ -1060,8 +1061,10 @@ clear_pmdnuma: out_unlock: spin_unlock(&mm->page_table_lock); - if (page) + if (page) { put_page(page); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + } return 0; } diff --git a/mm/memory.c b/mm/memory.c index d52542680e10..8012c1907895 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid, target_nid; + int current_nid = -1; + int target_nid; /* * The "pte" at this point cannot be used safely without @@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, current_nid = target_nid; out: + task_numa_fault(current_nid, 1); return 0; } @@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; + int curr_nid; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = vm_normal_page(vma, addr, pteval); if (unlikely(!page)) continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + pte_unmap_unlock(pte, ptl); + + curr_nid = page_to_nid(page); + task_numa_fault(curr_nid, 1); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); -- cgit From 03c5a6e16322c997bf8f264851bfa3f532ad515f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 14:52:48 +0000 Subject: mm: numa: Add pte updates, hinting and migration stats It is tricky to quantify the basic cost of automatic NUMA placement in a meaningful manner. 
This patch adds some vmstats that can be used as part of a basic costing model.

u = basic unit = sizeof(void *)
Ca = cost of struct page access = sizeof(struct page) / u
Cpte = Cost PTE access = Ca
Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock)
	where Cpte is incurred twice for a read and a write and Wlock is a
	constant representing the cost of taking or releasing a lock
Cnumahint = Cost of a minor page fault = some high constant e.g. 1000
Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u
Ci = Cost of page isolation = Ca + Wi
	where Wi is a constant that should reflect the approximate cost of
	the locking operation
Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma)
	where Wnuma is the approximate NUMA factor. 1 is local. 1.2 would
	imply that remote accesses are 20% more expensive

Balancing cost = Cpte * numa_pte_updates +
		 Cnumahint * numa_hint_faults +
		 Ci * numa_pages_migrated +
		 Cpagecopy * numa_pages_migrated

Note that numa_pages_migrated is used as a measure of how many pages were isolated even though it would miss pages that failed to migrate. A vmstat counter could have been added for it but the isolation cost is pretty marginal in comparison to the overall cost so it seemed overkill.

The ideal way to measure automatic placement benefit would be to count the number of remote accesses versus local accesses and do something like

	benefit = (remote_accesses_before - remote_accesses_after) * Wnuma

but the information is not readily available. As a workload converges, the expectation would be that the number of remote NUMA hints would reduce to 0.

	convergence = numa_hint_faults_local / numa_hint_faults

where this is measured over the last N NUMA hints recorded. When the workload is fully converged the value is 1. This can measure whether the placement policy is converging and how fast it is doing it.
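As a purely illustrative worked example, the weights below are arbitrary assumptions (not values from the patch) and the counters are pretend deltas from /proc/vmstat; the point is only to show how the model is evaluated:

#include <stdio.h>

int main(void)
{
	double u = sizeof(void *);		/* basic unit */
	double Ca = 64.0 / u;			/* assume 64-byte struct page */
	double Cpte = Ca;
	double Wi = 10.0, Wnuma = 1.2;		/* arbitrary weights */
	double Cnumahint = 1000.0;
	double Cpagerw = Ca + 4096.0 / u;	/* assume 4K pages */
	double Ci = Ca + Wi;
	double Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma);

	/* pretend deltas read from /proc/vmstat over a run */
	double numa_pte_updates = 100000, numa_hint_faults = 20000;
	double numa_hint_faults_local = 15000, numa_pages_migrated = 5000;

	double cost = Cpte * numa_pte_updates +
		      Cnumahint * numa_hint_faults +
		      Ci * numa_pages_migrated +
		      Cpagecopy * numa_pages_migrated;

	printf("balancing cost (in units of u): %.0f\n", cost);
	printf("convergence: %.2f\n",
	       numa_hint_faults_local / numa_hint_faults);
	return 0;
}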
Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- mm/huge_memory.c | 5 +++++ mm/memory.c | 12 ++++++++++++ mm/mempolicy.c | 2 ++ mm/migrate.c | 3 ++- mm/vmstat.c | 6 ++++++ 5 files changed, 27 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee8133794a56..f3a477fffd09 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1026,6 +1026,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; + int current_nid = -1; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,6 +1035,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); spin_unlock(&mm->page_table_lock); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) diff --git a/mm/memory.c b/mm/memory.c index 8012c1907895..8a7b4ccbe136 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3477,6 +3477,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); + count_vm_numa_event(NUMA_HINT_FAULTS); page = vm_normal_page(vma, addr, pte); if (!page) { pte_unmap_unlock(ptep, ptl); @@ -3485,6 +3486,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(page); current_nid = page_to_nid(page); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { @@ -3517,6 +3520,9 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; + int local_nid = numa_node_id(); + unsigned long nr_faults = 0; + unsigned long nr_faults_local = 0; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3565,10 +3571,16 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, curr_nid = page_to_nid(page); task_numa_fault(curr_nid, 1); + nr_faults++; + if (curr_nid == local_nid) + nr_faults_local++; + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); + count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults); + count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local); return 0; } #else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a7a62fe7c280..516491fbfaa8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -583,6 +583,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); return nr_updated; } diff --git a/mm/migrate.c b/mm/migrate.c index c7d550011a64..23bba5d6edff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1514,7 +1514,8 @@ int migrate_misplaced_page(struct page *page, int node) if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; - } + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); } BUG_ON(!list_empty(&migratepages)); out: diff --git a/mm/vmstat.c b/mm/vmstat.c index 3a067fabe190..c0f1f6db5182 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,6 +774,12 @@ const char * const vmstat_text[] = { "pgrotated", +#ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", + 
"numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", +#endif #ifdef CONFIG_MIGRATION "pgmigrate_success", "pgmigrate_fail", -- cgit From 5606e3877ad8baea42f3a71ebde0a03622bbb551 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 18:19:13 +0000 Subject: mm: numa: Migrate on reference policy This is the simplest possible policy that still does something of note. When a pte_numa is faulted, it is moved immediately. Any replacement policy must at least do better than this and in all likelihood this policy regresses normal workloads. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- mm/mempolicy.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 516491fbfaa8..4c1c8d83ac6a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -118,6 +118,26 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (!pol) { + node = numa_node_id(); + if (node != -1) + pol = &preferred_node_policy[node]; + + /* preferred_node_policy is not initialised early in boot */ + if (!pol->mode) + pol = NULL; + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -1598,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = task->mempolicy; + struct mempolicy *pol = get_task_policy(task); if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -2021,7 +2041,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int cpuset_mems_cookie; @@ -2295,6 +2315,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long default: BUG(); } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) + polnid = numa_node_id(); + if (curnid != polnid) ret = polnid; out: @@ -2483,6 +2508,15 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + for_each_node(nid) { + preferred_node_policy[nid] = (struct mempolicy) { + .refcnt = ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or -- cgit From 9532fec118d485ea37ab6e3ea372d68cd8b4cd0d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 15 Nov 2012 01:24:32 +0000 Subject: mm: numa: Migrate pages handled during a pmd_numa hinting fault To say that the PMD handling code was incorrectly transferred from autonuma is an understatement. The intention was to handle a PMDs worth of pages in the same fault and effectively batch the taking of the PTL and page migration. The copied version instead has the impact of clearing a number of pte_numa PTE entries and whether any page migration takes place depends on racing. This just happens to work in some cases. This patch handles pte_numa faults in batch when a pmd_numa fault is handled. 
The pages are migrated if they are currently misplaced. Essentially this is making an assumption that NUMA locality is on a PMD boundary but that could be addressed by only setting pmd_numa if all the pages within that PMD are on the same node if necessary. Signed-off-by: Mel Gorman --- mm/memory.c | 51 ++++++++++++++++++++++++++++++++++----------------- mm/mprotect.c | 25 ++++++++++++++++++++----- 2 files changed, 54 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8a7b4ccbe136..84c6d9eab182 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3449,6 +3449,18 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int current_nid) +{ + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + return mpol_misplaced(page, vma, addr); +} + int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { @@ -3477,18 +3489,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); - count_vm_numa_event(NUMA_HINT_FAULTS); page = vm_normal_page(vma, addr, pte); if (!page) { pte_unmap_unlock(ptep, ptl); return 0; } - get_page(page); current_nid = page_to_nid(page); - if (current_nid == numa_node_id()) - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - target_nid = mpol_misplaced(page, vma, addr); + target_nid = numa_migrate_prep(page, vma, addr, current_nid); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { /* @@ -3505,7 +3513,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, current_nid = target_nid; out: - task_numa_fault(current_nid, 1); + if (current_nid != -1) + task_numa_fault(current_nid, 1); return 0; } @@ -3521,8 +3530,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; bool numa = false; int local_nid = numa_node_id(); - unsigned long nr_faults = 0; - unsigned long nr_faults_local = 0; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3545,7 +3552,8 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; - int curr_nid; + int curr_nid = local_nid; + int target_nid; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3566,21 +3574,30 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* only check non-shared pages */ if (unlikely(page_mapcount(page) != 1)) continue; - pte_unmap_unlock(pte, ptl); - curr_nid = page_to_nid(page); - task_numa_fault(curr_nid, 1); + /* + * Note that the NUMA fault is later accounted to either + * the node that is currently running or where the page is + * migrated to. 
+ */ + curr_nid = local_nid; + target_nid = numa_migrate_prep(page, vma, addr, + page_to_nid(page)); + if (target_nid == -1) { + put_page(page); + continue; + } - nr_faults++; - if (curr_nid == local_nid) - nr_faults_local++; + /* Migrate to the requested node */ + pte_unmap_unlock(pte, ptl); + if (migrate_misplaced_page(page, target_nid)) + curr_nid = target_nid; + task_numa_fault(curr_nid, 1); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); - count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults); - count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local); return 0; } #else diff --git a/mm/mprotect.c b/mm/mprotect.c index 7ef6ae964e8f..dce6fb48edc4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable, int prot_numa, bool *ret_all_same_node) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + bool all_same_node = true; + int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -61,6 +63,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { + int this_nid = page_to_nid(page); + if (last_nid == -1) + last_nid = this_nid; + if (last_nid != this_nid) + all_same_node = false; + /* only check non-shared pages */ if (!pte_numa(oldpte) && page_mapcount(page) == 1) { @@ -81,7 +89,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (updated) pages++; - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -101,6 +108,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + *ret_all_same_node = all_same_node; return pages; } @@ -127,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * pmd_t *pmd; unsigned long next; unsigned long pages = 0; + bool all_same_node; pmd = pmd_offset(pud, addr); do { @@ -143,9 +152,15 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_none_or_clear_bad(pmd)) continue; pages += change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); - - if (prot_numa) + dirty_accountable, prot_numa, &all_same_node); + + /* + * If we are changing protections for NUMA hinting faults then + * set pmd_numa if the examined pages were all on the same + * node. This allows a regular PMD to be handled as one fault + * and effectively batches the taking of the PTL + */ + if (prot_numa && all_same_node) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); -- cgit From 8177a420ed7c16c171ed3c3aec5b0676db38c247 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 23 Mar 2012 20:56:34 +0100 Subject: mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting This defines the per-node data used by Migrate On Fault in order to rate limit the migration. The rate limiting is applied independently to each destination node. 
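The pg_data_t fields themselves are declared in include/linux/mmzone.h and so fall outside this mm/-limited diff; judging from the initialisation added below, they are roughly of the following shape (a sketch for orientation, not the exact header hunk):

  #ifdef CONFIG_NUMA_BALANCING
          /* Rate limiting of NUMA-balancing migrations into this node (sketch) */
          spinlock_t numabalancing_migrate_lock;
          unsigned long numabalancing_migrate_next_window;   /* in jiffies */
          unsigned long numabalancing_migrate_nr_pages;
  #endif
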
Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5953dc2d196f..ef025e20dbee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4449,6 +4449,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); +#ifdef CONFIG_NUMA_BALANCING + spin_lock_init(&pgdat->numabalancing_migrate_lock); + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies; +#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); -- cgit From a8f6077213d285ca08dbf6d4a67470787388138b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 14 Nov 2012 21:41:46 +0000 Subject: mm: numa: Rate limit the amount of memory that is migrated between nodes NOTE: This is very heavily based on similar logic in autonuma. It should be signed off by Andrea but because there was no standalone patch and it's sufficiently different from what he did that the signed-off is omitted. Will be added back if requested. If a large number of pages are misplaced then the memory bus can be saturated just migrating pages between nodes. This patch rate-limits the amount of memory that can be migrating between nodes. Signed-off-by: Mel Gorman --- mm/migrate.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 23bba5d6edff..4b8267f1842f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1460,6 +1460,14 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } +/* + * page migration rate limiting control. + * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs + * window of time. Default here says do not migrate more than 1280M per second. + */ +static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -1467,6 +1475,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, */ int migrate_misplaced_page(struct page *page, int node) { + pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; LIST_HEAD(migratepages); @@ -1479,8 +1488,27 @@ int migrate_misplaced_page(struct page *page, int node) goto out; } + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! 
+ */ + spin_lock(&pgdat->numabalancing_migrate_lock); + if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies + + msecs_to_jiffies(migrate_interval_millisecs); + } + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { + spin_unlock(&pgdat->numabalancing_migrate_lock); + put_page(page); + goto out; + } + pgdat->numabalancing_migrate_nr_pages++; + spin_unlock(&pgdat->numabalancing_migrate_lock); + /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(NODE_DATA(node), 1)) { + if (migrate_balanced_pgdat(pgdat, 1)) { int page_lru; if (isolate_lru_page(page)) { -- cgit From e14808b49f55e0e1135da5e4a154a540dd9f3662 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 10:59:15 +0000 Subject: mm: numa: Rate limit setting of pte_numa if node is saturated If there are a large number of NUMA hinting faults and all of them are resulting in migrations it may indicate that memory is just bouncing uselessly around. NUMA balancing cost is likely exceeding any benefit from locality. Rate limit the PTE updates if the node is migration rate-limited. As noted in the comments, this distorts the NUMA faulting statistics. Signed-off-by: Mel Gorman --- mm/migrate.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 4b8267f1842f..32a1afca6009 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1464,10 +1464,30 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. + * If a node is rate-limited then PTE NUMA updates are also rate-limited. However + * as it is faults that reset the window, pte updates will happen unconditionally + * if there has not been a fault since @pteupdate_interval_millisecs after the + * throttle window closed. */ static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); +/* Returns true if NUMA migration is currently rate limited */ +bool migrate_ratelimited(int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + + if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + + msecs_to_jiffies(pteupdate_interval_millisecs))) + return false; + + if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) + return false; + + return true; +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit From 57e0a0309160b1b4ebde9f3c6a867cd96ac368bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 12 Nov 2012 09:06:20 +0000 Subject: mm: numa: Introduce last_nid to the page frame This patch introduces a last_nid field to the page struct. This is used to build a two-stage filter in the next patch that is aimed at mitigating a problem whereby pages migrate to the wrong node when referenced by a process that was running off its home node. 
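The new field and its accessors live in include/linux/mm.h and mm_types.h, so they are not part of the mm/-limited diff below. The helpers used by this and the following patches (reset_page_last_nid(), page_last_nid(), page_xchg_last_nid()) can be pictured roughly as follows, assuming the simple variant where last_nid is stored as a plain int in struct page rather than packed into page->flags; the member name _last_nid is assumed for illustration:

  static inline int page_last_nid(struct page *page)
  {
          return page->_last_nid;        /* assumed member name */
  }

  static inline int page_xchg_last_nid(struct page *page, int nid)
  {
          return xchg(&page->_last_nid, nid);
  }

  static inline void reset_page_last_nid(struct page *page)
  {
          page->_last_nid = -1;
  }
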
Signed-off-by: Mel Gorman --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef025e20dbee..73f226a1206e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -608,6 +608,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } + reset_page_last_nid(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3826,6 +3827,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); + reset_page_last_nid(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit From 5aa80374a10567f8e25de0615d3d40f3aa3a4298 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 27 Nov 2012 14:40:32 +0000 Subject: mm: numa: split_huge_page: Transfer last_nid on tail page Pass last_nid from head page to tail page. Signed-off-by: Hillf Danton Signed-off-by: Mel Gorman --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3a477fffd09..79b96064f8fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1362,6 +1362,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); -- cgit From bac0382c6ad764156025978845147e5a6eccca09 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 27 Nov 2012 14:46:24 +0000 Subject: mm: numa: migrate: Set last_nid on newly allocated page Pass last_nid from misplaced page to newly allocated migration target page. Signed-off-by: Hillf Danton Signed-off-by: Mel Gorman --- mm/migrate.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 32a1afca6009..2a5ce135eef0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1457,6 +1457,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); + if (newpage) + page_xchg_last_nid(newpage, page_last_nid(page)); + return newpage; } -- cgit From e42c8ff2999de1239a57d434bfbd8e9f2a56e814 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 12 Nov 2012 09:17:07 +0000 Subject: mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships Note: This two-stage filter was taken directly from the sched/numa patch "sched, numa, mm: Add the scanning page fault machinery" but is only a partial extraction. As the end result is not necessarily recognisable, the signed-offs-by had to be removed. Will be added back if requested. While it is desirable that all threads in a process run on its home node, this is not always possible or necessary. There may be more threads than exist within the node or the node might over-subscribed with unrelated processes. This can cause a situation whereby a page gets migrated off its home node because the threads clearing pte_numa were running off-node. This patch uses page->last_nid to build a two-stage filter before pages get migrated to avoid problems with short or unlikely task<->node relationships. 
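As a worked example of the quadratic effect described in the comment added below: if a given node accounts for only about 10% of the hinting faults on a page, the chance that two consecutive faults both come from that node is roughly 0.1^2 = 1%, so short or incidental task<->page relations rarely pass the filter, while a node that dominates accesses to the page passes it almost every time.
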
Signed-off-by: Mel Gorman --- mm/mempolicy.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c1c8d83ac6a..fd20e28fd2ad 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2317,9 +2317,37 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long } /* Migrate the page towards the node whose CPU is referencing it */ - if (pol->flags & MPOL_F_MORON) + if (pol->flags & MPOL_F_MORON) { + int last_nid; + polnid = numa_node_id(); + /* + * Multi-stage node selection is used in conjunction + * with a periodic migration fault to build a temporal + * task<->page relation. By using a two-stage filter we + * remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist + * probability, we can equate a task's usage of a + * particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and + * getting the same result twice in a row, given these + * samples are fully independent, is then given by + * P(n)^2, provided our sample period is sufficiently + * short compared to the usage pattern. + * + * This quadric squishes small probabilities, making + * it less likely we act on an unlikely task<->page + * relation. + */ + last_nid = page_xchg_last_nid(page, polnid); + if (last_nid != polnid) + goto out; + } + if (curnid != polnid) ret = polnid; out: -- cgit From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. 
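The scan-rate adjustment itself is made in task_numa_fault() on the scheduler side (kernel/sched/fair.c) and is therefore not visible in the mm/-limited diff below, which only threads the new "migrated" argument through. Conceptually it behaves along these lines; this is a rough sketch only, and the exact back-off step is assumed rather than taken from the patch:

  void task_numa_fault(int node, int pages, bool migrated)
  {
          struct task_struct *p = current;

          /*
           * Sketch: when a hinting fault did not result in a migration,
           * slow the per-task PTE scanner down, bounded by the sysctl
           * maximum. The once-a-minute reset mentioned above lives
           * elsewhere in the scheduler and is not shown.
           */
          if (!migrated)
                  p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
                                            p->numa_scan_period * 2);

          task_numa_placement(p);
  }
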
Signed-off-by: Mel Gorman --- mm/huge_memory.c | 2 +- mm/memory.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79b96064f8fc..199b261a257e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1068,7 +1068,7 @@ out_unlock: spin_unlock(&mm->page_table_lock); if (page) { put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); } return 0; } diff --git a/mm/memory.c b/mm/memory.c index 84c6d9eab182..39edb11b63dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int current_nid = -1; int target_nid; + bool migrated = false; /* * The "pte" at this point cannot be used safely without @@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) current_nid = target_nid; out: if (current_nid != -1) - task_numa_fault(current_nid, 1); + task_numa_fault(current_nid, 1, migrated); return 0; } @@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; int curr_nid = local_nid; int target_nid; + bool migrated; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ pte_unmap_unlock(pte, ptl); - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) curr_nid = target_nid; - task_numa_fault(curr_nid, 1); + task_numa_fault(curr_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit From 1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 11:16:36 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing This patch adds Kconfig options and kernel parameters to allow the enabling and disabling of automatic NUMA balancing. The existance of such a switch was and is very important when debugging problems related to transparent hugepages and we should have the same for automatic NUMA placement. Signed-off-by: Mel Gorman --- mm/mempolicy.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fd20e28fd2ad..046308e9b999 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2521,6 +2521,50 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + if (nr_node_ids > 1 && !numabalancing_override) { + printk(KERN_INFO "Enabling automatic NUMA balancing. 
" + "Configure with numa_balancing= or sysctl"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + numabalancing_override = true; + + if (!strcmp(str, "enable")) { + set_numabalancing_state(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_numabalancing_state(false); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2571,6 +2615,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ -- cgit From b32967ff101a7508f70be8de59b278d4df92fa00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 12:35:47 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case. Note: This is very heavily based on a patch from Peter Zijlstra with fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That patch put a lot of migration logic into mm/huge_memory.c where it does not belong. This version puts tries to share some of the migration logic with migrate_misplaced_page. However, it should be noted that now migrate.c is doing more with the pagetable manipulation than is preferred. The end result is barely recognisable so as before, the signed-offs had to be removed but will be re-added if the original authors are ok with it. Add THP migration for the NUMA working set scanning fault case. It uses the page lock to serialize. No migration pte dance is necessary because the pte is already unmapped when we decide to migrate. 
[dhillf@gmail.com: Fix memory leak on isolation failure] [dhillf@gmail.com: Fix transfer of last_nid information] Signed-off-by: Mel Gorman --- mm/huge_memory.c | 59 +++++++++----- mm/internal.h | 7 +- mm/memcontrol.c | 7 +- mm/migrate.c | 231 +++++++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 240 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 199b261a257e..711baf84b153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -600,7 +600,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -1023,10 +1023,12 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page = NULL; + struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; + bool migrated; + bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); - spin_unlock(&mm->page_table_lock); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) + if (target_nid == -1) { + put_page(page); goto clear_pmdnuma; + } - /* - * Due to lacking code to migrate thp pages, we'll split - * (which preserves the special PROT_NONE) and re-take the - * fault on the normal pages. 
- */ - split_huge_page(page); - put_page(page); - - return 0; + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; -clear_pmdnuma: + /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); - if (page) { - put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); - } + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); return 0; } diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..7e60ac826f2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..d97af9636ab2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. 
*/ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/mm/migrate.c b/mm/migrate.c index 2a5ce135eef0..c9400960fd52 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page)) + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node) return true; } -/* - * Attempt to migrate a misplaced page to the specified destination - * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. - */ -int migrate_misplaced_page(struct page *page, int node) +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat) { - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - LIST_HEAD(migratepages); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) { - put_page(page); - goto out; - } + bool rate_limited = false; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node) pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - spin_unlock(&pgdat->numabalancing_migrate_lock); - put_page(page); - goto out; - } - pgdat->numabalancing_migrate_nr_pages++; + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + rate_limited = true; + else + pgdat->numabalancing_migrate_nr_pages++; spin_unlock(&pgdat->numabalancing_migrate_lock); + + return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ + int ret = 0; /* Avoid migrating to a node that is nearly full */ if (migrate_balanced_pgdat(pgdat, 1)) { @@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node) if (isolate_lru_page(page)) { put_page(page); - goto out; + return 0; } - isolated = 1; + /* Page is isolated */ + ret = 1; page_lru = page_is_file_cache(page); - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - list_add(&page->lru, &migratepages); + if (!PageTransHuge(page)) + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + else + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + HPAGE_PMD_NR); } /* @@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node) */ put_page(page); - if (isolated) { - int nr_remaining; - - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); - if (nr_remaining) { - putback_lru_pages(&migratepages); - isolated = 0; - } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); + return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. 
+ */ +int migrate_misplaced_page(struct page *page, int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + int nr_remaining; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; } + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) { + put_page(page); + goto out; + } + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) + goto out; + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); out: return isolated; } + +int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) + goto out_dropref; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) + goto out_dropref; + + new_page = alloc_pages_node(node, + (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + if (!new_page) + goto out_dropref; + page_xchg_last_nid(new_page, page_last_nid(page)); + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) { + put_page(new_page); + goto out_keep_locked; + } + + /* Prepare a page as a migration target */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + + /* anon mapping, we can simply copy page->mapping to the new page: */ + new_page->mapping = page->mapping; + new_page->index = page->index; + migrate_page_copy(new_page, page); + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry))) { + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) + SetPageActive(page); + if (TestClearPageUnevictable(new_page)) + SetPageUnevictable(page); + mlock_migrate_page(page, new_page); + + unlock_page(new_page); + put_page(new_page); /* Free it */ + + unlock_page(page); + putback_lru_page(page); + + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + goto out; + } + + /* + * Traditional migration needs to prepare the memcg charge + * transaction early to prevent the old page from being + * uncharged when installing migration entries. Here we can + * save the potential rollback and start the charge transfer + * only when migration is already known to end successfully. 
+ */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = pmd_mknonnuma(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + + page_add_new_anon_rmap(new_page, vma, haddr); + + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, entry); + page_remove_rmap(page); + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge + * before it's fully transferred to the new page. + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); + + unlock_page(new_page); + unlock_page(page); + put_page(page); /* Drop the rmap reference */ + put_page(page); /* Drop the LRU isolation reference */ + + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); + return isolated; + +out_dropref: + put_page(page); +out_keep_locked: + return 0; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -- cgit From 220018d388b8ab1fca1c5f0c6474bab47ad2c9c0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Dec 2012 09:32:56 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case build fix Commit "Add THP migration for the NUMA working set scanning fault case" breaks the build because HPAGE_PMD_SHIFT and HPAGE_PMD_MASK defined to explode without CONFIG_TRANSPARENT_HUGEPAGE: mm/migrate.c: In function 'migrate_misplaced_transhuge_page_put': mm/migrate.c:1549: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1564: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1566: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1573: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1606: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1648: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed CONFIG_NUMA_BALANCING allows compilation without enabling transparent hugepages, so define the dummy function for such a configuration and only define migrate_misplaced_transhuge_page_put() when transparent hugepages are enabled. Signed-off-by: David Rientjes Signed-off-by: Mel Gorman --- mm/migrate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index c9400960fd52..9341a501d168 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1602,7 +1602,9 @@ int migrate_misplaced_page(struct page *page, int node) out: return isolated; } +#endif /* CONFIG_NUMA_BALANCING */ +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, -- cgit From 7548341b28956ccd35a63ab12f01d8541041aa70 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 27 Nov 2012 10:31:44 +0000 Subject: mm: numa: Account for failed allocations and isolations as migration failures Subject says it all. Allocation failures and a failure to isolate should be accounted as a migration failure. This is partially another difference between base page and transhuge page migration. 
A base page migration makes multiple attempts for these conditions before it would be accounted for as a failure. Signed-off-by: Mel Gorman --- mm/migrate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 9341a501d168..26537c4f3094 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1635,12 +1635,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, new_page = alloc_pages_node(node, (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) + if (!new_page) { + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); goto out_dropref; + } page_xchg_last_nid(new_page, page_last_nid(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); put_page(new_page); goto out_keep_locked; } -- cgit From d28d433512f4f387e2563c14db45a7bb8a338b1a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Nov 2012 09:24:36 +0000 Subject: mm: migrate: Account a transhuge page properly when rate limiting If there is excessive migration due to NUMA balancing it gets rate limited. It does this by counting the number of pages it has migrated recently but counts a transhuge page as 1 page. Account for it properly. Signed-off-by: Mel Gorman --- mm/migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 26537c4f3094..f24e9cc49cc4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1492,7 +1492,7 @@ bool migrate_ratelimited(int node) } /* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat) +bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) { bool rate_limited = false; @@ -1510,7 +1510,7 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat) if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) rate_limited = true; else - pgdat->numabalancing_migrate_nr_pages++; + pgdat->numabalancing_migrate_nr_pages += nr_pages; spin_unlock(&pgdat->numabalancing_migrate_lock); return rate_limited; @@ -1579,7 +1579,7 @@ int migrate_misplaced_page(struct page *page, int node) * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (numamigrate_update_ratelimit(pgdat)) { + if (numamigrate_update_ratelimit(pgdat, 1)) { put_page(page); goto out; } @@ -1630,7 +1630,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (numamigrate_update_ratelimit(pgdat)) + if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) goto out_dropref; new_page = alloc_pages_node(node, -- cgit From 5a505085f043e8380f83610f79642853c051e2f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:46 +0000 Subject: mm/rmap: Convert the struct anon_vma::mutex to an rwsem Convert the struct anon_vma::mutex to an rwsem, which will help in solving a page-migration scalability problem. (Addressed in a separate patch.) The conversion is simple and straightforward: in every case where we mutex_lock()ed we'll now down_write(). 
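The struct member and the anon_vma_lock()/anon_vma_unlock() wrappers are defined in include/linux/rmap.h and so do not appear in the mm/-limited diff below; after the conversion they look roughly like this (a sketch of the header side, not the exact hunk):

  /* In struct anon_vma: the lock changes type */
  struct rw_semaphore rwsem;     /* used to be: struct mutex mutex; */

  static inline void anon_vma_lock(struct anon_vma *anon_vma)
  {
          down_write(&anon_vma->root->rwsem);
  }

  static inline void anon_vma_unlock(struct anon_vma *anon_vma)
  {
          up_write(&anon_vma->root->rwsem);
  }
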
Suggested-by: Linus Torvalds Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/huge_memory.c | 4 ++-- mm/mmap.c | 8 ++++---- mm/rmap.c | 22 +++++++++++----------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 711baf84b153..acd37fe55eb7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1292,7 +1292,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1495,7 +1495,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { diff --git a/mm/mmap.c b/mm/mmap.c index 9a796c41e7d9..88408632da66 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2561,15 +2561,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); + down_write(&anon_vma->root->rwsem); /* * We can safely modify head.next after taking the - * anon_vma->root->mutex. If some other vma in this mm shares + * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) @@ -2671,7 +2671,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) diff --git a/mm/rmap.c b/mm/rmap.c index 2ee1ef0f317b..6e3ee3b82798 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->mutex + * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock */ @@ -103,7 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. 
*/ - if (mutex_is_locked(&anon_vma->root->mutex)) { + if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock(anon_vma); anon_vma_unlock(anon_vma); } @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); root = new_root; - mutex_lock(&root->mutex); + down_write(&root->rwsem); } return root; } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); } /* @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to acquire the anon_vma->root->mutex. + * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - mutex_init(&anon_vma->mutex); + init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT; } @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (mutex_trylock(&root_anon_vma->mutex)) { + if (down_write_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - mutex_unlock(&root_anon_vma->mutex); + up_write(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -1299,7 +1299,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->mutex or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. -- cgit From 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:50 +0000 Subject: mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable rmap_walk_anon() and try_to_unmap_anon() appears to be too careful about locking the anon vma: while it needs protection against anon vma list modifications, it does not need exclusive access to the list itself. 
Transforming this exclusive lock to a read-locked rwsem removes a global lock from the hot path of page-migration intense threaded workloads which can cause pathological performance like this: 96.43% process 0 [kernel.kallsyms] [k] perf_trace_sched_switch | --- perf_trace_sched_switch __schedule schedule schedule_preempt_disabled __mutex_lock_common.isra.6 __mutex_lock_slowpath mutex_lock | |--50.61%-- rmap_walk | move_to_new_page | migrate_pages | migrate_misplaced_page | __do_numa_page.isra.69 | handle_pte_fault | handle_mm_fault | __do_page_fault | do_page_fault | page_fault | __memset_sse2 | | | --100.00%-- worker_thread | | | --100.00%-- start_thread | --49.39%-- page_lock_anon_vma try_to_unmap_anon try_to_unmap migrate_pages migrate_misplaced_page __do_numa_page.isra.69 handle_pte_fault handle_mm_fault __do_page_fault do_page_fault page_fault __memset_sse2 | --100.00%-- worker_thread start_thread With this change applied the profile is now nicely flat and there's no anon-vma related scheduling/blocking. Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make it clearer that it's an exclusive write-lock in that case - suggested by Rik van Riel. Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Rik van Riel Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/huge_memory.c | 6 +++--- mm/ksm.c | 6 +++--- mm/memory-failure.c | 4 ++-- mm/migrate.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/rmap.c | 48 ++++++++++++++++++++++++------------------------ 7 files changed, 35 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acd37fe55eb7..a24c9cb9c83e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page) int ret = 1; BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/mm/ksm.c b/mm/ksm.c index ae539f0b8aa1..7fa37de1ee0c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1634,7 +1634,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1688,7 +1688,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1741,7 +1741,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ddb68a169e45..f2cd830f66c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head 
*to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* diff --git a/mm/migrate.c b/mm/migrate.c index f24e9cc49cc4..6e46485f014c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma() understands the subtleties of + * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. */ anon_vma = page_get_anon_vma(page); diff --git a/mm/mmap.c b/mm/mmap.c index 88408632da66..68a16b40c209 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -602,7 +602,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..3dabd170753a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); } } diff --git a/mm/rmap.c b/mm/rmap.c index 6e3ee3b82798..b0f612df7b9d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma() such that + * Synchronize against page_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * mutex_trylock() from page_lock_anon_vma(). This orders: + * down_read_trylock() from page_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma() VS put_anon_vma() - * mutex_trylock() atomic_dec_and_test() + * page_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() mutex_is_locked() + * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ if (rwsem_is_locked(&anon_vma->root->rwsem)) { - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). 
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -442,7 +442,7 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (down_write_trylock(&root_anon_vma->rwsem)) { + if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - up_write(&root_anon_vma->rwsem); + up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock() recursion. + * we'll deadlock on the anon_vma_lock_write() recursion. 
*/ - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ out: return anon_vma; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } /* @@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return referenced; @@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return referenced; } @@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return ret; @@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return ret; } @@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); return ret; } -- cgit From d0e1d66b5aa1ec9f556f951aa9a114cc192cd01c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Dec 2012 16:00:21 -0800 Subject: writeback: remove nr_pages_dirtied arg from balance_dirty_pages_ratelimited_nr() There is no reason to pass the nr_pages_dirtied argument, because nr_pages_dirtied value from the caller is unused in balance_dirty_pages_ratelimited_nr(). Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 830893b2b3c7..6f4271224493 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, } /* - * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. 
* * If dirty_poll_interval is too low, big NUMA machines will call the expensive @@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied - * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, - unsigned long nr_pages_dirtied) +void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ratelimit; @@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, */ p = &__get_cpu_var(dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { + unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; @@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, if (unlikely(current->nr_dirtied >= ratelimit)) balance_dirty_pages(mapping, current->nr_dirtied); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); void throttle_vm_writeout(gfp_t gfp_mask) { -- cgit From 377e4f167664d8bc390c04c911d846366000159c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 11 Dec 2012 16:00:24 -0800 Subject: mm: show migration types in show_mem This is useful to diagnose the reason for page allocation failure for cases where there appear to be several free pages. Example, with this alloc_pages(GFP_ATOMIC) failure: swapper/0: page allocation failure: order:0, mode:0x0 ... Mem-info: Normal per-cpu: CPU 0: hi: 90, btch: 15 usd: 48 CPU 1: hi: 90, btch: 15 usd: 21 active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:84 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 free:4026 slab_reclaimable:75 slab_unreclaimable:484 mapped:0 shmem:0 pagetables:0 bounce:0 Normal free:16104kB min:2296kB low:2868kB high:3444kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:336kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:331776kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:300kB slab_unreclaimable:1936kB kernel_stack:328kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 0 Before the patch, it's hard (for me, at least) to say why all these free chunks weren't considered for allocation: Normal: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 1*2048kB 3*4096kB = 16128kB After the patch, it's obvious that the reason is that all of these are in the MIGRATE_CMA (C) freelist: Normal: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB (C) 1*512kB (C) 1*1024kB (C) 1*2048kB (C) 3*4096kB (C) = 16128kB Signed-off-by: Rabin Vincent Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..dc018b486b74 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2877,6 +2877,31 @@ out: #define K(x) ((x) << (PAGE_SHIFT-10)) +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RESERVE] = 'R', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif + [MIGRATE_ISOLATE] = 'I', + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk("(%s) ", tmp); +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -3005,6 +3030,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; @@ -3013,12 +3039,24 @@ void show_free_areas(unsigned int filter) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr[order] = zone->free_area[order].nr_free; + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!list_empty(&area->free_list[type])) + types[order] |= 1 << type; + } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) + for (order = 0; order < MAX_ORDER; order++) { printk("%lu*%lukB ", nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } printk("= %lukB\n", K(total)); } -- cgit From 19965460e31c73a934d2c19c152f876a75bdff3e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:00:26 -0800 Subject: mm, memcg: make mem_cgroup_out_of_memory() static mem_cgroup_out_of_memory() is only referenced from within file scope, so it can be marked static. 
Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..cf6d0df4849c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) +static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { struct mem_cgroup *iter; unsigned long chosen_points = 0; -- cgit From e5adfffc857788c8b7eca0e98cf1e26f1964b292 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 11 Dec 2012 16:00:29 -0800 Subject: mm: use IS_ENABLED(CONFIG_NUMA) instead of NUMA_BUILD We don't need custom NUMA_BUILD anymore, since we have handy IS_ENABLED(). Signed-off-by: Kirill A. Shutemov Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++-------- mm/vmalloc.c | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc018b486b74..a49b0ea3cc2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1871,7 +1871,7 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && @@ -1917,7 +1917,8 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup if there are multiple nodes * and before considering the first zone allowed @@ -1936,7 +1937,7 @@ zonelist_scan: * As we may have just activated ZLC, check if the first * eligible zone has failed zone_reclaim recently. */ - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1962,11 +1963,11 @@ try_this_zone: if (page) break; this_zone_full: - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_mark_zone_full(zonelist, z); } - if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ zlc_active = 0; goto zonelist_scan; @@ -2266,7 +2267,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; /* After successful reclaim, reconsider all zones for allocation */ - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_clear_zones_full(zonelist); retry: @@ -2412,7 +2413,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. 
*/ - if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: @@ -2819,7 +2821,7 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e08300db21..5123a169ab7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) static void show_numa_info(struct seq_file *m, struct vm_struct *v) { - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; if (!counters) @@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); if (ptr == NULL) return -ENOMEM; -- cgit From d84da3f9e4f18809821562bd960e00a10673b341 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 11 Dec 2012 16:00:31 -0800 Subject: mm: use IS_ENABLED(CONFIG_COMPACTION) instead of COMPACTION_BUILD We don't need custom COMPACTION_BUILD anymore, since we have handy IS_ENABLED(). Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b7ed37675644..a1ce17f44be0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1752,7 +1752,7 @@ out: /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (COMPACTION_BUILD && sc->order && + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -2005,7 +2005,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - if (COMPACTION_BUILD) { + if (IS_ENABLED(CONFIG_COMPACTION)) { /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2421,7 +2421,8 @@ static bool zone_balanced(struct zone *zone, int order, balance_gap, classzone_idx, 0)) return false; - if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) + if (IS_ENABLED(CONFIG_COMPACTION) && order && + !compaction_suitable(zone, order)) return false; return true; @@ -2684,7 +2685,7 @@ loop_again: * Do not reclaim more than needed for compaction. */ testorder = order; - if (COMPACTION_BUILD && order && + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, order) != COMPACT_SKIPPED) testorder = 0; -- cgit From 344aa35c27acdf70d3c67b5aa7cb6aa8585f80c1 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:34 -0800 Subject: thp: clean up __collapse_huge_page_isolate There are duplicated places using release_pte_pages(). And release_all_pte_pages() can be removed. 
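For illustration, the shape of the cleanup is roughly the following standalone sketch (simplified; the toy resource and names are invented, this is not the kernel code): every failure branch takes the same exit label, which releases exactly what was acquired, so a separate release-everything helper is no longer needed.

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for per-entry isolation and release; illustrative only */
    static bool grab(int i) { return i != 3; }      /* pretend entry 3 fails */

    static void release_range(int from, int to)
    {
        int i;

        for (i = from; i < to; i++)
            printf("release %d\n", i);
    }

    static int isolate_all(int n)
    {
        int i;

        for (i = 0; i < n; i++) {
            if (!grab(i))
                goto out;               /* every failure takes the same exit */
        }
        return 1;                       /* success: nothing to undo */
    out:
        release_range(0, i);            /* undo exactly what was grabbed */
        return 0;
    }

    int main(void) { printf("isolated: %d\n", isolate_all(5)); return 0; }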
Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b415..6f022f505e88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1701,64 +1701,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) } } -static void release_all_pte_pages(pte_t *pte) -{ - release_pte_pages(pte, pte + HPAGE_PMD_NR); -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { struct page *page; pte_t *_pte; - int referenced = 0, isolated = 0, none = 0; + int referenced = 0, none = 0; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval)) { if (++none <= khugepaged_max_ptes_none) continue; - else { - release_pte_pages(pte, _pte); + else goto out; - } } - if (!pte_present(pteval) || !pte_write(pteval)) { - release_pte_pages(pte, _pte); + if (!pte_present(pteval) || !pte_write(pteval)) goto out; - } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - release_pte_pages(pte, _pte); + if (unlikely(!page)) goto out; - } + VM_BUG_ON(PageCompound(page)); BUG_ON(!PageAnon(page)); VM_BUG_ON(!PageSwapBacked(page)); /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) { - release_pte_pages(pte, _pte); + if (page_count(page) != 1) goto out; - } /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { - release_pte_pages(pte, _pte); + if (!trylock_page(page)) goto out; - } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (isolate_lru_page(page)) { unlock_page(page); - release_pte_pages(pte, _pte); goto out; } /* 0 stands for page_is_file_cache(page) == false */ @@ -1771,12 +1756,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } - if (unlikely(!referenced)) - release_all_pte_pages(pte); - else - isolated = 1; + if (likely(referenced)) + return 1; out: - return isolated; + release_pte_pages(pte, _pte); + return 0; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, -- cgit From 6219049ae1ce32b89236646cccaec2a5fc6c4fd2 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:37 -0800 Subject: mm: introduce mm_find_pmd() Several place need to find the pmd by(mm_struct, address), so introduce a function to simplify it. 
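After this change the callers all share the same shape; a condensed fragment of that pattern, distilled from the call sites in the diff below (not compilable on its own), looks like:

    pmd_t *pmd;

    pmd = mm_find_pmd(mm, address);   /* one call replaces the pgd/pud/pmd walk */
    if (!pmd)
        goto out;                     /* nothing mapped at this address */
    if (pmd_trans_huge(*pmd))
        goto out;                     /* callers that need a pte table skip huge pmds */

The helper returns NULL as soon as any level of the walk is not present, so each caller keeps only the checks that are specific to it.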
[akpm@linux-foundation.org: fix warning] Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 55 ++++++++++--------------------------------------------- mm/internal.h | 5 +++++ mm/ksm.c | 14 ++------------ mm/migrate.c | 14 ++------------ mm/rmap.c | 48 +++++++++++++++++++++++++----------------------- 5 files changed, 44 insertions(+), 92 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6f022f505e88..9ae97242aa8d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1146,22 +1146,14 @@ pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd, *ret = NULL; if (address & ~HPAGE_PMD_MASK) goto out; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto out; if (pmd_page(*pmd) != page) @@ -1908,8 +1900,6 @@ static void collapse_huge_page(struct mm_struct *mm, struct vm_area_struct *vma, int node) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; @@ -1955,17 +1945,10 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; VM_BUG_ON(vma->vm_flags & VM_NO_THP); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - /* pmd can't go away or become huge under us */ - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; anon_vma_lock(vma->anon_vma); @@ -2048,8 +2031,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, unsigned long address, struct page **hpage) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte, *_pte; int ret = 0, referenced = 0, none = 0; @@ -2060,16 +2041,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2363,22 +2338,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..52d1fa957194 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -91,6 +91,11 @@ extern unsigned long highest_memmap_pfn; extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +/* + * in mm/rmap.c: + */ +extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); + /* * in mm/page_alloc.c */ diff --git a/mm/ksm.c 
b/mm/ksm.c index ae539f0b8aa1..31ae5ea1eac0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep; spinlock_t *ptl; @@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); BUG_ON(pmd_trans_huge(*pmd)); - if (!pmd_present(*pmd)) - goto out; mmun_start = addr; mmun_end = addr + PAGE_SIZE; diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d773705..1dc4598d2513 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -91,8 +91,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; @@ -103,19 +101,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; ptl = &mm->page_table_lock; } else { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); if (pmd_trans_huge(*pmd)) goto out; - if (!pmd_present(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); diff --git a/mm/rmap.c b/mm/rmap.c index 2ee1ef0f317b..46823fb0e801 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return address; } +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + pmd = NULL; +out: + return pmd; +} + /* * Check that @page is mapped at @address into @mm. 
* @@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; @@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, goto check; } - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; if (pmd_trans_huge(*pmd)) return NULL; @@ -1345,8 +1357,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t pteval; @@ -1366,16 +1376,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (end > vma->vm_end) end = vma->vm_end; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return ret; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return ret; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return ret; mmun_start = address; -- cgit From fa475e517adb422cb3492e636195f9b2c0d009c8 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:39 -0800 Subject: thp: introduce hugepage_vma_check() Multiple places do the same check. Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ae97242aa8d..26002683a16c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1894,6 +1894,20 @@ static struct page } #endif +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + VM_BUG_ON(vma->vm_flags & VM_NO_THP); + return true; +} + static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, @@ -1934,17 +1948,8 @@ static void collapse_huge_page(struct mm_struct *mm, hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - goto out; - - if (!vma->anon_vma || vma->vm_ops) - goto out; - if (is_vma_temporary_stack(vma)) + if (!hugepage_vma_check(vma)) goto out; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); - pmd = mm_find_pmd(mm, address); if (!pmd) goto out; @@ -2152,20 +2157,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - - if ((!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) { - skip: + if (!hugepage_vma_check(vma)) { +skip: progress++; continue; } - if (!vma->anon_vma || vma->vm_ops) - goto skip; - if (is_vma_temporary_stack(vma)) - goto skip; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & 
HPAGE_PMD_MASK; if (hstart >= hend) -- cgit From b3092b3b734f146d96ca023a75cacf78078f96d5 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:41 -0800 Subject: thp: cleanup: introduce mk_huge_pmd() Introduce mk_huge_pmd() to simplify the code Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 26002683a16c..ea5fb93a53a9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) +{ + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + return entry; +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(page, vma); /* * The spinlocking to take the lru_lock inside * page_add_new_anon_rmap() acts as a full memory @@ -951,9 +958,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(new_page, vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); @@ -2000,9 +2005,7 @@ static void collapse_huge_page(struct mm_struct *mm, __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - _pmd = mk_pmd(new_page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - _pmd = pmd_mkhuge(_pmd); + _pmd = mk_huge_pmd(new_page, vma); /* * spin_lock() below is not the equivalent of smp_wmb(), so -- cgit From b023f46813cde6e3b8a8c24f432ff9c1fd8e9a64 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:45 -0800 Subject: memory-hotplug: skip HWPoisoned page when offlining pages hwpoisoned may be set when we offline a page by the sysfs interface /sys/devices/system/memory/soft_offline_page or /sys/devices/system/memory/hard_offline_page. If we don't clear this flag when onlining pages, this page can't be freed, and will not in free list. So we can't offline these pages again. So we should skip such page when offlining pages. 
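The idea of threading a skip flag through the isolation checks can be pictured with a small standalone sketch (toy states and names, not the kernel code): offline callers tolerate poisoned entries that will never return to the free lists, while other callers still treat them as busy.

    #include <stdbool.h>
    #include <stdio.h>

    enum { FREE, BUSY, POISONED };      /* toy page states */

    static bool range_isolated(const int *page, int n, bool skip_poisoned)
    {
        int i;

        for (i = 0; i < n; i++) {
            if (page[i] == FREE)
                continue;
            if (skip_poisoned && page[i] == POISONED)
                continue;               /* not in the free lists, but safe to offline */
            return false;
        }
        return true;
    }

    int main(void)
    {
        int pages[] = { FREE, POISONED, FREE };

        printf("offline view: %d\n", range_isolated(pages, 3, true));   /* 1 */
        printf("cma view:     %d\n", range_isolated(pages, 3, false));  /* 0 */
        return 0;
    }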
Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Andi Kleen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 5 +++-- mm/page_alloc.c | 27 +++++++++++++++++++++++---- mm/page_isolation.c | 27 ++++++++++++++++++++------- 4 files changed, 47 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8b20278be6a6..2c9fc7340b12 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) * Isolate the page, so that it doesn't get reallocated if it * was free. */ - set_migratetype_isolate(p); + set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b91..0095d156324a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -847,7 +847,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, { int ret; long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); offlined = nr_pages; if (!ret) *(long *)data += offlined; @@ -894,7 +894,8 @@ static int __ref __offline_pages(unsigned long start_pfn, nr_pages = end_pfn - start_pfn; /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + ret = start_isolate_page_range(start_pfn, end_pfn, + MIGRATE_MOVABLE, true); if (ret) goto out; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a49b0ea3cc2f..6f50cfe98a7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5616,7 +5616,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. */ -bool has_unmovable_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; int mt; @@ -5651,6 +5652,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count) continue; } + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if (skip_hwpoisoned_pages && PageHWPoison(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -5693,7 +5701,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; - return !has_unmovable_pages(zone, page, 0); + return !has_unmovable_pages(zone, page, 0, true); } #ifdef CONFIG_CMA @@ -5864,7 +5872,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype); + pfn_max_align_up(end), migratetype, + false); if (ret) return ret; @@ -5903,7 +5912,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } /* Make sure the range is really isolated. 
*/ - if (test_pages_isolated(outer_start, end)) { + if (test_pages_isolated(outer_start, end, false)) { pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", outer_start, end); ret = -EBUSY; @@ -6018,6 +6027,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { + pfn++; + SetPageReserved(page); + continue; + } + BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f2f5b4818e94..9d2264ea4606 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype) zone->nr_pageblock_isolate--; } -int set_migratetype_isolate(struct page *page) +int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; unsigned long flags, pfn; @@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page) * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found)) + if (!has_unmovable_pages(zone, page, arg.pages_found, + skip_hwpoisoned_pages)) ret = 0; /* @@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * Returns 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype) + unsigned migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn; unsigned long undo_pfn; @@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page)) { + if (page && + set_migratetype_isolate(page, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } @@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Returns 1 if all pages in the range are isolated. */ static int -__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { struct page *page; @@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) else if (page_count(page) == 0 && get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; + else if (skip_hwpoisoned_pages && PageHWPoison(page)) { + /* + * The HWPoisoned page may be not in buddy + * system, and page_count() is not 0. 
+ */ + pfn++; + continue; + } else break; } @@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) return 1; } -int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { unsigned long pfn, flags; struct page *page; @@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) /* Check all pages are free or Marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } -- cgit From 95a4774d055c72d96ab192a1c6675cbf4d513f71 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:47 -0800 Subject: memory-hotplug: update mce_bad_pages when removing the memory When we hotremove a memory device, we will free the memory to store struct page. If the page is hwpoisoned page, we should decrease mce_bad_pages. [akpm@linux-foundation.org: cleanup ifdefs] Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index a83de2f72b30..c7be01906998 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -771,6 +771,27 @@ out: return ret; } +#ifdef CONFIG_MEMORY_FAILURE +static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ + int i; + + if (!memmap) + return; + + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageHWPoison(&memmap[i])) { + atomic_long_sub(1, &mce_bad_pages); + ClearPageHWPoison(&memmap[i]); + } + } +} +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; @@ -784,6 +805,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) ms->pageblock_flags = NULL; } + clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); } #endif -- cgit From 7c72eb327282ee7fcadc5ef227c075cf72467ba7 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:49 -0800 Subject: memory-hotplug: auto offline page_cgroup when onlining memory block failed When a memory block is onlined, we will try allocate memory on that node to store page_cgroup. If onlining the memory block failed, we don't offline the page cgroup, and we have no chance to offline this page cgroup unless the memory block is onlined successfully again. It will cause that we can't hot-remove the memory device on that node, because some memory is used to store page cgroup. If onlining the memory block is failed, there is no need to stort page cgroup for this memory. So auto offline page_cgroup when onlining memory block failed. 
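The invariant behind the fix: whatever the going-online notification allocates must be torn down on the cancel notification, not only on offline. A minimal standalone model of that pairing (event names shortened, nothing here is the real notifier code):

    #include <stdio.h>

    enum event { GOING_ONLINE, ONLINE, CANCEL_ONLINE, GOING_OFFLINE, OFFLINE };

    static int allocated;       /* stands in for the per-block page_cgroup storage */

    static void callback(enum event e)
    {
        switch (e) {
        case GOING_ONLINE:
            allocated = 1;              /* allocate metadata for the new block */
            break;
        case CANCEL_ONLINE:             /* onlining failed: undo GOING_ONLINE */
        case OFFLINE:
            allocated = 0;
            break;
        default:
            break;
        }
    }

    int main(void)
    {
        callback(GOING_ONLINE);
        callback(CANCEL_ONLINE);        /* without this case the metadata would leak */
        printf("leaked: %d\n", allocated);
        return 0;
    }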
Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5ddad0c6daa6..44db00e253ed 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, mn->nr_pages, mn->status_change_nid); break; case MEM_CANCEL_ONLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: -- cgit From 97d0da2204ed9e34d9d42c2024c5bea5543f13c6 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:52 -0800 Subject: memory-hotplug: fix NR_FREE_PAGES mismatch NR_FREE_PAGES will be wrong after offlining pages. We add/dec NR_FREE_PAGES like this now: 1. move all pages in buddy system to MIGRATE_ISOLATE, and dec NR_FREE_PAGES 2. don't add NR_FREE_PAGES when it is freed and the migratetype is MIGRATE_ISOLATE 3. dec NR_FREE_PAGES when offlining isolated pages. 4. add NR_FREE_PAGES when undoing isolate pages. When we come to step 3, all pages are in MIGRATE_ISOLATE list, and NR_FREE_PAGES are right. When we come to step4, all pages are not in buddy system, so we don't change NR_FREE_PAGES in this step, but we change NR_FREE_PAGES in step3. So NR_FREE_PAGES is wrong after offlining pages. So there is no need to change NR_FREE_PAGES in step3. This patch also fixs a problem in step2: if the migratetype is MIGRATE_ISOLATE, we should not add NR_FRR_PAGES when we remove pages from pcppages. 
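Concretely, suppose NR_FREE_PAGES starts at 1000 and a fully free 512-page block is offlined: step 1 (isolation) already drops the counter to 488, so subtracting another 512 in step 3 would leave it 512 too low; with this patch step 3 leaves it at 488, which is the real number of free pages once the block is gone.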
Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Cc: Jianguo Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f50cfe98a7b..4dba04f06880 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + } } while (--to_free && --batch_free && !list_empty(list)); } - __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -6047,8 +6049,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - - (1UL << order)); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); -- cgit From 8732794b166196cc501c2ddd9e7c97cf45ab64c5 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:56 -0800 Subject: numa: convert static memory to dynamically allocated memory for per node device We use a static array to store struct node. In many cases, we don't have too many nodes, and some memory will be unused. Convert it to per-device dynamically allocated memory. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..1ef2cd4ae3c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void) * remove hstate attributes from any nodes that have them. */ for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(&node_devices[nid]); + hugetlb_unregister_node(node_devices[nid]); } /* @@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void) int nid; for_each_node_state(nid, N_HIGH_MEMORY) { - struct node *node = &node_devices[nid]; + struct node *node = node_devices[nid]; if (node->dev.id == nid) hugetlb_register_node(node); } -- cgit From 3ac19f8efe26451cacac31d0be34fa9c51114c2a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:59 -0800 Subject: memory-hotplug, mm/sparse.c: clear the memory to store struct page If sparse memory vmemmap is enabled, we can't free the memory to store struct page when a memory device is hotremoved, because we may store struct page in the memory to manage the memory which doesn't belong to this memory device. When we hotadded this memory device again, we will reuse this memory to store struct page, and struct page may contain some obsolete information, and we will get bad-page state: init_memory_mapping: [mem 0x80000000-0x9fffffff] Built 2 zonelists in Node order, mobility grouping on. 
Total pages: 547617 Policy zone: Normal BUG: Bad page state in process bash pfn:9b6dc page:ffffea0002200020 count:0 mapcount:0 mapping: (null) index:0xfdfdfdfdfdfdfdfd page flags: 0x2fdfdfdfd5df9fd(locked|referenced|uptodate|dirty|lru|active|slab|owner_priv_1|private|private_2|writeback|head|tail|swapcache|reclaim|swapbacked|unevictable|uncached|compound_lock) Modules linked in: netconsole acpiphp pci_hotplug acpi_memhotplug loop kvm_amd kvm microcode tpm_tis tpm tpm_bios evdev psmouse serio_raw i2c_piix4 i2c_core parport_pc parport processor button thermal_sys ext3 jbd mbcache sg sr_mod cdrom ata_generic virtio_net ata_piix virtio_blk libata virtio_pci virtio_ring virtio scsi_mod Pid: 988, comm: bash Not tainted 3.6.0-rc7-guest #12 Call Trace: [] ? bad_page+0xb0/0x100 [] ? free_pages_prepare+0xb3/0x100 [] ? free_hot_cold_page+0x48/0x1a0 [] ? online_pages_range+0x68/0xa0 [] ? __online_page_increment_counters+0x10/0x10 [] ? walk_system_ram_range+0x101/0x110 [] ? online_pages+0x1a5/0x2b0 [] ? __memory_block_change_state+0x20d/0x270 [] ? store_mem_state+0xb6/0xf0 [] ? sysfs_write_file+0xd2/0x160 [] ? vfs_write+0xaa/0x160 [] ? sys_write+0x47/0x90 [] ? async_page_fault+0x25/0x30 [] ? system_call_fastpath+0x16/0x1b Disabling lock debugging due to kernel taint This patch clears the memory to store struct page to avoid unexpected error. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Reported-by: Vasilis Liaskovitis Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index c7be01906998..6b5fb762e2ca 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) got_map_page: ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); got_map_ptr: - memset(ret, 0, memmap_size); return ret; } @@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } + memset(memmap, 0, sizeof(struct page) * nr_pages); + ms->section_mem_map |= SECTION_MARKED_PRESENT; ret = sparse_init_one_section(ms, section_nr, memmap, usemap); -- cgit From 6dcd73d7011ba9046f9b98e7f7c9d958f5810e6b Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:01:01 -0800 Subject: memory-hotplug: allocate zone's pcp before onlining pages We use __free_page() to put a page to buddy system when onlining pages. __free_page() will store NR_FREE_PAGES in zone's pcp.vm_stat_diff, so we should allocate zone's pcp before onlining pages, otherwise we will lose some free pages. [mhocko@suse.cz: make zone_pcp_reset independent of MEMORY_HOTREMOVE] Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++++-- mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0095d156324a..ec2f199cc5f7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -498,12 +498,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) * So, zonelist must be updated after online. 
*/ mutex_lock(&zonelists_mutex); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { need_zonelists_rebuild = 1; + build_all_zonelists(NULL, zone); + } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + if (need_zonelists_rebuild) + zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, @@ -519,7 +523,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) if (onlined_pages) { node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); if (need_zonelists_rebuild) - build_all_zonelists(NULL, zone); + build_all_zonelists(NULL, NULL); else zone_pcp_update(zone); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dba04f06880..5a7b7611d4e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5983,7 +5983,6 @@ void __meminit zone_pcp_update(struct zone *zone) } #endif -#ifdef CONFIG_MEMORY_HOTREMOVE void zone_pcp_reset(struct zone *zone) { unsigned long flags; @@ -6003,6 +6002,7 @@ void zone_pcp_reset(struct zone *zone) local_irq_restore(flags); } +#ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. */ -- cgit From d9713679dbd2a6ecb840cd5b65a3ec555c1ec3d4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:01:03 -0800 Subject: memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY] Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY], it forgets to manage node_states[N_NORMAL_MEMORY]. This may cause node_states[N_NORMAL_MEMORY] to become incorrect. Example, if a node is empty before online, and we online a memory which is in ZONE_NORMAL. And after online, node_states[N_HIGH_MEMORY] is correct, but node_states[N_NORMAL_MEMORY] is incorrect, the online code doesn't set the new online node to node_states[N_NORMAL_MEMORY]. The same thing will happen when offlining (the offline code doesn't clear the node from node_states[N_NORMAL_MEMORY] when needed). Some memory managment code depends node_states[N_NORMAL_MEMORY], so we have to fix up the node_states[N_NORMAL_MEMORY]. We add node_states_check_changes_online() and node_states_check_changes_offline() to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed while hotpluging. Also add @status_change_nid_normal to struct memory_notify, thus the memory hotplug callbacks know whether the node_states[N_NORMAL_MEMORY] are changed. (We can add a @flags and reuse @status_change_nid instead of introducing @status_change_nid_normal, but it will add much more complexity in memory hotplug callback in every subsystem. 
So introducing @status_change_nid_normal is better and it doesn't change the sematics of @status_change_nid) Signed-off-by: Lai Jiangshan Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Rob Landley Cc: Jiang Liu Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Mel Gorman Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 136 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ec2f199cc5f7..72195602ded5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -460,6 +460,53 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +/* check which state of node_states will be changed when online memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + int nid = zone_to_nid(zone); + enum zone_type zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * if the memory to be online is in a zone of 0...zone_last, and + * the zones of 0...zone_last don't have memory before online, we will + * need to set the node to node_states[N_NORMAL_MEMORY] after + * the memory is online. + */ + if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; + else + arg->status_change_nid_normal = -1; + + /* + * if the node don't have memory befor online, we will need to + * set the node to node_states[N_HIGH_MEMORY] after the memory + * is online. + */ + if (!node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid = nid; + else + arg->status_change_nid = -1; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + node_set_state(node, N_HIGH_MEMORY); +} + int __ref online_pages(unsigned long pfn, unsigned long nr_pages) { @@ -471,13 +518,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) struct memory_notify arg; lock_memory_hotplug(); + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex. + */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; + node_states_check_changes_online(nr_pages, zone, &arg); nid = page_to_nid(pfn_to_page(pfn)); - if (node_present_pages(nid) == 0) - arg.status_change_nid = nid; ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -486,12 +538,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) unlock_memory_hotplug(); return ret; } - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. 
@@ -521,7 +567,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (onlined_pages) { - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -871,6 +917,67 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +/* check which state of node_states will be changed when offline memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt, zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is in a zone of 0...zone_last, + * and it is the last present memory, 0...zone_last will + * become empty after offline , thus we can determind we will + * need to clear the node from node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + else + arg->status_change_nid_normal = -1; + + /* + * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + */ + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_HIGH_MEMORY] will be changed + * If we try to offline the last present @nr_pages from the node, + * we can determind we will need to clear the node from + * node_states[N_HIGH_MEMORY]. 
+ */ + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (nr_pages >= present_pages) + arg->status_change_nid = zone_to_nid(zone); + else + arg->status_change_nid = -1; +} + +static void node_states_clear_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_clear_state(node, N_NORMAL_MEMORY); + + if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && + (arg->status_change_nid >= 0)) + node_clear_state(node, N_HIGH_MEMORY); +} + static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { @@ -905,9 +1012,7 @@ static int __ref __offline_pages(unsigned long start_pfn, arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; - if (nr_pages >= node_present_pages(node)) - arg.status_change_nid = node; + node_states_check_changes_offline(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); @@ -980,10 +1085,9 @@ repeat: } else zone_pcp_update(zone); - if (!node_present_pages(node)) { - node_clear_state(node, N_HIGH_MEMORY); + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) kswapd_stop(node); - } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); -- cgit From b9d5ab2562eceeada5e4837a621b6260574dd11d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:01:05 -0800 Subject: slub, hotplug: ignore unrelated node's hot-adding and hot-removing SLUB only focuses on the nodes which have normal memory and it ignores the other node's hot-adding and hot-removing. Aka: if some memory of a node which has no onlined memory is online, but this new memory onlined is not normal memory (for example, highmem), we should not allocate kmem_cache_node for SLUB. And if the last normal memory is offlined, but the node still has memory, we should remove kmem_cache_node for that node. (The current code delays it when all of the memory is offlined) So we only do something when marg->status_change_nid_normal > 0. marg->status_change_nid is not suitable here. The same problem doesn't exist in SLAB, because SLAB allocates kmem_list3 for every node even the node don't have normal memory, SLAB tolerates kmem_list3 on alien nodes. SLUB only focuses on the nodes which have normal memory, it don't tolerate alien kmem_cache_node. The patch makes SLUB become self-compatible and avoids WARNs and BUGs in rare conditions. Signed-off-by: Lai Jiangshan Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Rob Landley Cc: Andrew Morton Cc: Jiang Liu Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Mel Gorman Cc: Wen Congyang Acked-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a0d698467f70..487f0bdd53c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg) struct memory_notify *marg = arg; int offline_node; - offline_node = marg->status_change_nid; + offline_node = marg->status_change_nid_normal; /* * If the node still has available memory. 
we need kmem_cache_node @@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg) struct kmem_cache_node *n; struct kmem_cache *s; struct memory_notify *marg = arg; - int nid = marg->status_change_nid; + int nid = marg->status_change_nid_normal; int ret = 0; /* -- cgit From 712cd386fdc983d318fecf302a2a9cb8e9de90c9 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 11 Dec 2012 16:01:07 -0800 Subject: mm/memory_hotplug.c: update start_pfn in zone and pg_data when spanned_pages == 0. If we hot-remove memory only and leave the cpus alive, the corresponding node will not be removed. But the node_start_pfn and node_spanned_pages in pg_data will be reset to 0. In this case, when we hot-add the memory back next time, the node_start_pfn will always be 0 because no pfn is less than 0. After that, if we hot-remove the memory again, it will cause kernel panic in function find_biggest_section_pfn() when it tries to scan all the pfns. The zone will also have the same problem. This patch sets start_pfn to the start_pfn of the section being added when spanned_pages of the zone or pg_data is 0. ---How to reproduce--- 1. hot-add a container with some memory and cpus; 2. hot-remove the container's memory, and leave cpus there; 3. hot-add these memory again; 4. hot-remove them again; then, the kernel will panic. ---Call trace--- BUG: unable to handle kernel paging request at 00000fff82a8cc38 IP: [] find_biggest_section_pfn+0xe5/0x180 ...... Call Trace: [] __remove_zone+0x184/0x1b0 [] __remove_section+0x8c/0xb0 [] __remove_pages+0xe7/0x120 [] arch_remove_memory+0x2c/0x80 [] remove_memory+0x56/0x90 [] acpi_memory_device_remove_memory+0x48/0x73 [] acpi_memory_device_notify+0x153/0x274 [] acpi_ev_notify_dispatch+0x41/0x5f [] acpi_os_execute_deferred+0x27/0x34 [] process_one_work+0x219/0x680 [] worker_thread+0x12e/0x320 [] kthread+0xc6/0xd0 [] kernel_thread_helper+0x4/0x10 ...... ---[ end trace 96d845dbf33fee11 ]--- Signed-off-by: Tang Chen Cc: Yasuaki Ishimatsu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 72195602ded5..571130ee66d7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (start_pfn < zone->zone_start_pfn) + if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -220,7 +220,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long old_pgdat_end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; - if (start_pfn < pgdat->node_start_pfn) + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - -- cgit From e749eb95531ac8349df47f8d46ce2641dcb16589 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 11 Dec 2012 16:01:09 -0800 Subject: mm: add comment on storage key dirty bit semantics Add comments that dirty bit in storage key gets set whenever page content is changed. Hopefully if someone will use this function, he'll have a look at one of the two places where we comment on this. 
Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 46823fb0e801..cf7e99a87c32 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1151,9 +1151,11 @@ void page_remove_rmap(struct page *page) * containing the swap entry, but page not yet written to swap. * * And we can skip it on file pages, so long as the filesystem - * participates in dirty tracking; but need to catch shm and tmpfs - * and ramfs pages which have been modified since creation by read - * fault. + * participates in dirty tracking (note that this is not only an + * optimization but also solves problems caused by dirty flag in + * storage key getting set by a write from inside kernel); but need to + * catch shm and tmpfs and ramfs pages which have been modified since + * creation by read fault. * * Note that mapping must be decided above, before decrementing * mapcount (which luckily provides a barrier): once page is unmapped, -- cgit From e9868505987a03a26a3979f27b82911ccc003752 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Dec 2012 16:01:10 -0800 Subject: mm,vmscan: only evict file pages when we have plenty If we have more inactive file pages than active file pages, we skip scanning the active file pages altogether, with the idea that we do not want to evict the working set when there is plenty of streaming IO in the cache. However, the code forgot to also skip scanning anonymous pages in that situation. That leads to the curious situation of keeping the active file pages protected from being paged out when there are lots of inactive file pages, while still scanning and evicting anonymous pages. This patch fixes that situation, by only evicting file pages when we have plenty of them and most are inactive. [akpm@linux-foundation.org: adjust comment layout] Signed-off-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a1ce17f44be0..53947311c777 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, if (global_reclaim(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have very few page cache pages, - force-scan anon pages. */ if (unlikely(file + free <= high_wmark_pages(zone))) { + /* + * If we have very few page cache pages, force-scan + * anon pages. + */ fraction[0] = 1; fraction[1] = 0; denominator = 1; goto out; + } else if (!inactive_file_is_low_global(zone)) { + /* + * There is enough inactive page cache, do not + * reclaim anything from the working set right now. + */ + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; } } -- cgit From cf0cac0a09341549dedabcfc2a66dcbc2eaaf2b9 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 11 Dec 2012 16:01:13 -0800 Subject: mm: refactor reinsert of swap_info in sys_swapoff() The block within sys_swapoff() which re-inserts the swap_info into the swap_list in case of failure of try_to_unuse() reads a few values outside the swap_lock. While this is safe at that point, it is subtle code. Simplify the code by moving the reading of these values to a separate function, refactoring it a bit so they are read from within the swap_lock. 
This is easier to understand, and matches better the way it worked before I unified the insertion of the swap_info from both sys_swapon and sys_swapoff. This change should make no functional difference. The only real change is moving the read of two or three structure fields to within the lock (frontswap_map_get() is nothing more than a read of p->frontswap_map). Signed-off-by: Cesar Eduardo Barros Cc: Konrad Rzeszutek Wilk Cc: Dan Magenheimer Cc: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f91a25547ffe..27a52b731576 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static void enable_swap_info(struct swap_info_struct *p, int prio, +static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, unsigned long *frontswap_map) { int i, prev; - spin_lock(&swap_lock); if (prio >= 0) p->prio = prio; else @@ -1473,6 +1472,21 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else swap_info[prev]->next = p->type; frontswap_init(p->type); +} + +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + unsigned long *frontswap_map) +{ + spin_lock(&swap_lock); + _enable_swap_info(p, prio, swap_map, frontswap_map); + spin_unlock(&swap_lock); +} + +static void reinsert_swap_info(struct swap_info_struct *p) +{ + spin_lock(&swap_lock); + _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); spin_unlock(&swap_lock); } @@ -1548,14 +1562,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { - /* - * reading p->prio and p->swap_map outside the lock is - * safe here because only sys_swapon and sys_swapoff - * change them, and there can be no other sys_swapon or - * sys_swapoff for this swap_info_struct at this point. - */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); + reinsert_swap_info(p); goto out_dput; } -- cgit From 6555bc035731eab76c0901925034465d3ad2099c Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 11 Dec 2012 16:01:14 -0800 Subject: mm: do not call frontswap_init() during swapoff The call to frontswap_init() was added within enable_swap_info(), which was called not only during sys_swapon, but also to reinsert the swap_info into the swap_list in case of failure of try_to_unuse() within sys_swapoff. This means that frontswap_init() might be called more than once for the same swap area. While as far as I could see no frontswap implementation has any problem with it (and in fact, all the ones I found ignore the parameter passed to frontswap_init), this could change in the future. To prevent future problems, move the call to frontswap_init() to outside the code shared between sys_swapon and sys_swapoff. 
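Taken together with the previous patch, the resulting helpers look roughly like this (a sketch reconstructed from the two diffs; only the swapon path performs the one-time frontswap initialization, while the swapoff failure path goes through reinsert_swap_info() without it):

	static void enable_swap_info(struct swap_info_struct *p, int prio,
				     unsigned char *swap_map,
				     unsigned long *frontswap_map)
	{
		spin_lock(&swap_lock);
		_enable_swap_info(p, prio, swap_map, frontswap_map);
		frontswap_init(p->type);	/* one-time setup, sys_swapon only */
		spin_unlock(&swap_lock);
	}

	static void reinsert_swap_info(struct swap_info_struct *p)
	{
		/* re-insert after a failed try_to_unuse(): no frontswap_init() */
		spin_lock(&swap_lock);
		_enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
		spin_unlock(&swap_lock);
	}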
Signed-off-by: Cesar Eduardo Barros Cc: Konrad Rzeszutek Wilk Acked-by: Dan Magenheimer Cc: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 27a52b731576..0fbb45283c66 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1471,7 +1471,6 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; - frontswap_init(p->type); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1480,6 +1479,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, { spin_lock(&swap_lock); _enable_swap_info(p, prio, swap_map, frontswap_map); + frontswap_init(p->type); spin_unlock(&swap_lock); } -- cgit From 4de22c0584fb0566487b2cba5cdfbce346b18402 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:17 -0800 Subject: mm, highmem: use PKMAP_NR() to calculate an index of pkmap To calculate an index of pkmap, using PKMAP_NR() is more understandable and maintainable, so change it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 2da13a5c50e2..2576a7118981 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr) unsigned long addr = (unsigned long)vaddr; if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; + int i = PKMAP_NR(addr); return pte_page(pkmap_page_table[i]); } -- cgit From cc33a303f1c155cf0147964586bb80fa732d8a21 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:20 -0800 Subject: mm, highmem: remove useless pool_lock The pool_lock protects the page_address_pool from concurrent access. But, access to the page_address_pool is already protected by kmap_lock. So remove it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 2576a7118981..f0f0f1d5e691 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -328,7 +328,6 @@ struct page_address_map { * page_address_map freelist, allocated from page_address_maps. 
*/ static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ /* * Hash table bucket @@ -395,11 +394,9 @@ void set_page_address(struct page *page, void *virtual) if (virtual) { /* Add */ BUG_ON(list_empty(&page_address_pool)); - spin_lock_irqsave(&pool_lock, flags); pam = list_entry(page_address_pool.next, struct page_address_map, list); list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); pam->page = page; pam->virtual = virtual; @@ -413,9 +410,7 @@ void set_page_address(struct page *page, void *virtual) if (pam->page == page) { list_del(&pam->list); spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); goto done; } } @@ -438,7 +433,6 @@ void __init page_address_init(void) INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ -- cgit From a354e2c84eeebcd7d7bbdd71216895b8e9866b5c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:23 -0800 Subject: mm, highmem: remove page_address_pool list We can find free page_address_map instance without the page_address_pool. So remove it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index f0f0f1d5e691..4d6f96c20191 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -324,10 +324,7 @@ struct page_address_map { struct list_head list; }; -/* - * page_address_map freelist, allocated from page_address_maps. - */ -static struct list_head page_address_pool; /* freelist */ +static struct page_address_map page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -392,12 +389,7 @@ void set_page_address(struct page *page, void *virtual) pas = page_slot(page); if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - + pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; pam->page = page; pam->virtual = virtual; @@ -410,7 +402,6 @@ void set_page_address(struct page *page, void *virtual) if (pam->page == page) { list_del(&pam->list); spin_unlock_irqrestore(&pas->lock, flags); - list_add_tail(&pam->list, &page_address_pool); goto done; } } @@ -420,15 +411,10 @@ done: return; } -static struct page_address_map page_address_maps[LAST_PKMAP]; - void __init page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); - for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); -- cgit From eb2db439a3203ae86c35ad277ac4a3268a94baa1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:24 -0800 Subject: mm, highmem: get virtual address of the page using PKMAP_ADDR() In flush_all_zero_pkmaps(), we have an index of the pkmap associated with the page. Using this index, we can simply get virtual address of the page. So change it. 
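For reference, the pkmap helpers used by this and the earlier PKMAP_NR() cleanup are defined along these lines (x86-style sketch; the PKMAP_BASE value and exact spelling are architecture-specific):

	/* PKMAP_BASE is the arch-chosen start of the persistent kmap area */
	#define PKMAP_NR(virt)	(((virt) - PKMAP_BASE) >> PAGE_SHIFT)
	#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((nr) << PAGE_SHIFT))

so PKMAP_ADDR(PKMAP_NR(addr)) == addr for any page-aligned address in the pkmap range, which is why both conversions are equivalent to the open-coded arithmetic they replace.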
Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 4d6f96c20191..d999077431df 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void) * So no dangers, even with speculative execution. */ page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); + pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); need_flush = 1; -- cgit From a1dd450bcb1a05e8218b9aac0ee36f8755d8a140 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 11 Dec 2012 16:01:27 -0800 Subject: mm: thp: set the accessed flag for old pages on access fault On x86 memory accesses to pages without the ACCESSED flag set result in the ACCESSED flag being set automatically. With the ARM architecture a page access fault is raised instead (and it will continue to be raised until the ACCESSED flag is set for the appropriate PTE/PMD). For normal memory pages, handle_pte_fault will call pte_mkyoung (effectively setting the ACCESSED flag). For transparent huge pages, pmd_mkyoung will only be called for a write fault. This patch ensures that faults on transparent hugepages which do not result in a CoW update the access flags for the faulting pmd. Signed-off-by: Will Deacon Cc: Chris Metcalf Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 22 ++++++++++++++++++++++ mm/memory.c | 8 ++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ea5fb93a53a9..5f902e20e8c0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -784,6 +784,28 @@ out: return ret; } +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + pmd_t entry; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(&mm->page_table_lock); +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 221fc9ffcab1..765377385632 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3537,8 +3537,9 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + if (dirty && !pmd_write(orig_pmd) && !pmd_trans_splitting(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); @@ -3550,6 +3551,9 @@ retry: if (unlikely(ret & VM_FAULT_OOM)) goto retry; return ret; + } else { + huge_pmd_set_accessed(mm, vma, address, pmd, + orig_pmd, dirty); } return 0; } -- cgit From 9ff4868e3051d9128a24dd330bed32011a11421d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:01:30 -0800 Subject: mm, oom: allow exiting threads to have access to memory reserves Exiting threads, those with PF_EXITING set, can pagefault and require memory before they can make forward 
progress. This happens, for instance, when a process must fault task->robust_list, a userspace structure, before detaching its memory. These threads also aren't guaranteed to get access to memory reserves unless oom killed or killed from userspace. The oom killer won't grant memory reserves if other threads are also exiting other than current and stalling at the same point. This prevents needlessly killing processes when others are already exiting. Instead of special casing all the possible situations between PF_EXITING getting set and a thread detaching its mm where it may allocate memory, which probably wouldn't get updated when a change is made to the exit path, the solution is to give all exiting threads access to memory reserves if they call the oom killer. This allows them to quickly allocate, detach its mm, and free the memory it represents. Summary of Luigi's bug report: : He had an oom condition where threads were faulting on task->robust_list : and repeatedly called the oom killer but it would defer killing a thread : because it saw other PF_EXITING threads. This can happen anytime we need : to allocate memory after setting PF_EXITING and before detaching our mm; : if there are other threads in the same state then the oom killer won't do : anything unless one of them happens to be killed from userspace. : : So instead of only deferring for PF_EXITING and !task->robust_list, it's : better to just give them access to memory reserves to prevent a potential : livelock so that any other faults that may be introduced in the future in : the exit path don't cause the same problem (and hopefully we don't allow : too many of those!). Signed-off-by: David Rientjes Acked-by: Minchan Kim Tested-by: Luigi Semenzato Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e24831..7e9e9113bd05 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -310,26 +310,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; - if (task->flags & PF_EXITING) { + if (task->flags & PF_EXITING && !force_kill) { /* - * If task is current and is in the process of releasing memory, - * allow the "kill" to set TIF_MEMDIE, which will allow it to - * access memory reserves. Otherwise, it may stall forever. - * - * The iteration isn't broken here, however, in case other - * threads are found to have already been oom killed. + * If this task is not being ptraced on exit, then wait for it + * to finish before killing some other task unnecessarily. */ - if (task == current) - return OOM_SCAN_SELECT; - else if (!force_kill) { - /* - * If this task is not being ptraced on exit, then wait - * for it to finish before killing some other task - * unnecessarily. - */ - if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) - return OOM_SCAN_ABORT; - } + if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) + return OOM_SCAN_ABORT; } return OOM_SCAN_OK; } @@ -706,11 +693,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, return; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. 
*/ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } -- cgit From 5de55b265a13bc263c823bbe05d87d2c5e785f6f Mon Sep 17 00:00:00 2001 From: Matthieu CASTET Date: Tue, 11 Dec 2012 16:01:31 -0800 Subject: dmapool: make DMAPOOL_DEBUG detect corruption of free marker This can help catch the case where hardware keeps writing to a buffer after it has been freed back to the DMA pool. [akpm@linux-foundation.org: tidy code, fix comment, use sizeof(page->offset), use pr_err()] Signed-off-by: Matthieu Castet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index da1b0f0b8709..c69781e97cf9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, retval = offset + page->vaddr; *handle = offset + page->dma; #ifdef DMAPOOL_DEBUG + { + int i; + u8 *data = retval; + /* page->offset is stored in first 4 bytes */ + for (i = sizeof(page->offset); i < pool->size; i++) { + if (data[i] == POOL_POISON_FREED) + continue; + if (pool->dev) + dev_err(pool->dev, + "dma_pool_alloc %s, %p (corruped)\n", + pool->name, retval); + else + pr_err("dma_pool_alloc %s, %p (corruped)\n", + pool->name, retval); + + /* + * Dump the first 4 bytes even if they are not + * POOL_POISON_FREED + */ + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, + data, pool->size, 1); + break; + } + } memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif spin_unlock_irqrestore(&pool->lock, flags); -- cgit From ff604cf6d41f1e05f34762e1d764fe14a0f5f964 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 11 Dec 2012 16:01:32 -0800 Subject: mm: hwpoison: fix action_result() to print out dirty/clean action_result() fails to print out "dirty" even if an error occurred on a dirty pagecache page, because by the time we check PageDirty in action_result() the flag has already been cleared during page isolation, even if the page was dirty before error handling. This can break applications that monitor this message, so it should be fixed. There are several callers of action_result() other than page_action(), but none of them deal with LRU pages; they handle free pages or kernel pages, so we don't have to consider dirtiness for them. Note that PG_dirty can be set outside page locks as described in commit 6746aff74da2 ("HWPOISON: shmem: call set_page_dirty() with locked page"), so this patch does not completely close the race window, but just narrows it.
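For context, the strings in this table are consumed by a first-match walk over error_states; roughly (a sketch of the surrounding memory-failure logic, not code touched by this patch):

	struct page_state *ps;

	/*
	 * Pick the first entry whose masked page flags match; the table ends
	 * with a catch-all entry (mask == 0), so the walk always terminates.
	 */
	for (ps = error_states; ; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	/* ps->msg now already carries the "dirty "/"clean " wording */
	action_result(pfn, ps->msg, ps->action(p, pfn));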
Signed-off-by: Naoya Horiguchi Reviewed-by: Andi Kleen Cc: Tony Luck Cc: "Jun'ichi Nomura" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2c9fc7340b12..108c52fa60f6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -781,16 +781,16 @@ static struct page_state { { compound, compound, "huge", me_huge_page }, #endif - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, /* @@ -812,14 +812,14 @@ static struct page_state { #undef slab #undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - PageDirty(page) ? "dirty " : "", - msg, action_name[result]); + pr_err("MCE %#lx: %s page recovery: %s\n", + pfn, msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, -- cgit From 42d7395feb56f0655cd8b68e06fc6063823449f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 11 Dec 2012 16:01:34 -0800 Subject: mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB There was some desire in large applications using MAP_HUGETLB or SHM_HUGETLB to use 1GB huge pages on some mappings, and stay with 2MB on others. This is useful together with NUMA policy: use 2MB interleaving on some mappings, but 1GB on local mappings. This patch extends the IPC/SHM syscall interfaces slightly to allow specifying the page size. It borrows some upper bits in the existing flag arguments and allows encoding the log of the desired page size in addition to the *_HUGETLB flag. When 0 is specified the default size is used, this makes the change fully compatible. Extending the internal hugetlb code to handle this is straight forward. Instead of a single mount it just keeps an array of them and selects the right mount based on the specified page size. When no page size is specified it uses the mount of the default page size. The change is not visible in /proc/mounts because internal mounts don't appear there. It also has very little overhead: the additional mounts just consume a super block, but not more memory when not used. I also exported the new flags to the user headers (they were previously under __KERNEL__). Right now only symbols for x86 and some other architecture for 1GB and 2MB are defined. 
The interface should already work for all other architectures though. Only architectures that define multiple hugetlb sizes actually need it (that is currently x86, tile, powerpc). However tile and powerpc have user-configurable hugetlb sizes, so it's not easy to add defines. A program on those architectures would need to query sysfs and use the appropriate log2. [akpm@linux-foundation.org: cleanups] [rientjes@google.com: fix build] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Michael Kerrisk Acked-by: Rik van Riel Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9a796c41e7d9..ebf19031c5e4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1153,8 +1153,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, * memory so no accounting is necessary */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, - VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE); + VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } -- cgit From d37371870ceb1d2165397dc36114725b6dca946c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:38 -0800 Subject: mm: augment vma rbtree with rb_subtree_gap Define vma->rb_subtree_gap as the largest gap between any vma in the subtree rooted at that vma and its predecessor. Or, for a recursive definition, vma->rb_subtree_gap is the max of the gap vma->vm_start - vma->vm_prev->vm_end and the rb_subtree_gap fields of the vmas pointed to by vma->rb.rb_left and vma->rb.rb_right. This will allow get_unmapped_area_* to find a free area of the right size in O(log(N)) time, instead of potentially having to do a linear walk across all the VMAs. Also define mm->highest_vm_end as the vm_end field of the highest vma, so that we can easily check if the following gap is suitable. This does have the potential to make unmapping VMAs more expensive, especially for processes with very large numbers of VMAs, where the VMA rbtree can grow quite deep. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index ebf19031c5e4..bdcea6310fff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -297,6 +298,27 @@ out: return retval; } +static long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max, subtree_gap; + max = vma->vm_start; + if (vma->vm_prev) + max -= vma->vm_prev->vm_end; + if (vma->vm_rb.rb_left) { + subtree_gap = rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + if (vma->vm_rb.rb_right) { + subtree_gap = rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + return max; +} + #ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { @@ -327,6 +349,18 @@ static int browse_rb(struct rb_root *root) return i; } +static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) +{ + struct rb_node *nd; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + BUG_ON(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + } +} + void validate_mm(struct mm_struct *mm) { int bug = 0; @@ -349,9 +383,52 @@ void validate_mm(struct mm_struct *mm) BUG_ON(bug); } #else +#define validate_mm_rb(root, ignore) do { } while (0) #define validate_mm(mm) do { } while (0) #endif +RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_subtree_gap) + +/* + * Update augmented rbtree rb_subtree_gap values after vma->vm_start or + * vma->vm_prev->vm_end values changed, without modifying the vma's position + * in the rbtree. + */ +static void vma_gap_update(struct vm_area_struct *vma) +{ + /* + * As it turns out, RB_DECLARE_CALLBACKS() already created a callback + * function that does exacltly what we want. + */ + vma_gap_callbacks_propagate(&vma->vm_rb, NULL); +} + +static inline void vma_rb_insert(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* All rb_subtree_gap values must be consistent prior to insertion */ + validate_mm_rb(root, NULL); + + rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + /* + * Note rb_erase_augmented is a fairly large inline function, + * so make sure we instantiate it only once with our desired + * augmented rbtree callbacks. + */ + rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. @@ -421,8 +498,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { + /* Update tracking information for the gap following the new vma. 
*/ + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + mm->highest_vm_end = vma->vm_end; + + /* + * vma->vm_prev wasn't known when we followed the rbtree to find the + * correct insertion point for that vma. As a result, we could not + * update the vma vm_rb parents rb_subtree_gap values on the way down. + * So, we first insert the vma with a zero rb_subtree_gap value + * (to be consistent with what we did on the way down), and then + * immediately update the gap to the correct value. Finally we + * rebalance the rbtree after all augmented values have been set. + */ rb_link_node(&vma->vm_rb, rb_parent, rb_link); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); + vma->rb_subtree_gap = 0; + vma_gap_update(vma); + vma_rb_insert(vma, &mm->mm_rb); } static void __vma_link_file(struct vm_area_struct *vma) @@ -498,12 +592,12 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next; - prev->vm_next = next; + vma_rb_erase(vma, &mm->mm_rb); + prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; } @@ -525,6 +619,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; + bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; @@ -615,8 +710,14 @@ again: remove_next = 1 + (end > next->vm_end); vma_interval_tree_remove(next, root); } - vma->vm_start = start; - vma->vm_end = end; + if (start != vma->vm_start) { + vma->vm_start = start; + start_changed = true; + } + if (end != vma->vm_end) { + vma->vm_end = end; + end_changed = true; + } vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next << PAGE_SHIFT; @@ -645,6 +746,15 @@ again: remove_next = 1 + (end > next->vm_end); * (it may either follow vma or precede it). */ __insert_vm_struct(mm, insert); + } else { + if (start_changed) + vma_gap_update(vma); + if (end_changed) { + if (!next) + mm->highest_vm_end = end; + else if (!adjust_next) + vma_gap_update(next); + } } if (anon_vma) { @@ -678,10 +788,13 @@ again: remove_next = 1 + (end > next->vm_end); * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - if (remove_next == 2) { - next = vma->vm_next; + next = vma->vm_next; + if (remove_next == 2) goto again; - } + else if (next) + vma_gap_update(next); + else + mm->highest_vm_end = end; } if (insert && file) uprobe_mmap(insert); @@ -1784,6 +1897,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + vma->vm_mm->highest_vm_end = address; perf_event_mmap(vma); } } @@ -1838,6 +1955,7 @@ int expand_downwards(struct vm_area_struct *vma, vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); + vma_gap_update(vma); perf_event_mmap(vma); } } @@ -1960,14 +2078,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { - rb_erase(&vma->vm_rb, &mm->mm_rb); + vma_rb_erase(vma, &mm->mm_rb); mm->map_count--; tail_vma = vma; vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) + if (vma) { vma->vm_prev = prev; + vma_gap_update(vma); + } else + mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; -- cgit From 5a0768f641a5bad844860e67250baf0d1aa5e03c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:42 -0800 Subject: mm: check rb_subtree_gap correctness When CONFIG_DEBUG_VM_RB is enabled, check that rb_subtree_gap is correctly set for every vma and that mm->highest_vm_end is also correct. Also add an explicit 'bug' variable to track if browse_rb() detected any invalid condition. [akpm@linux-foundation.org: repair innovative coding-style inventions] Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bdcea6310fff..ff93f6c8436c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -322,31 +322,45 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) #ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { - int i = 0, j; + int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) - printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; - if (vma->vm_start < pend) + if (vma->vm_start < prev) { + printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + bug = 1; + } + if (vma->vm_start < pend) { printk("vm_start %lx pend %lx\n", vma->vm_start, pend); - if (vma->vm_start > vma->vm_end) - printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->vm_start > vma->vm_end) { + printk("vm_end %lx < vm_start %lx\n", + vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { + printk("free gap %lx, correct %lx\n", + vma->rb_subtree_gap, + vma_compute_subtree_gap(vma)); + bug = 1; + } i++; pn = nd; prev = vma->vm_start; pend = vma->vm_end; } j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) { + for (nd = pn; nd; nd = rb_prev(nd)) j++; + if (i != j) { + printk("backwards %d, forwards %d\n", j, i); + bug = 1; } - if (i != j) - printk("backwards %d, forwards %d\n", j, i), i = 0; - return i; + return bug ? 
-1 : i; } static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) @@ -365,6 +379,7 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; + unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; while (vma) { struct anon_vma_chain *avc; @@ -372,14 +387,24 @@ void validate_mm(struct mm_struct *mm) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); vma_unlock_anon_vma(vma); + highest_address = vma->vm_end; vma = vma->vm_next; i++; } - if (i != mm->map_count) - printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d vm_next %d\n", mm->map_count, i); + bug = 1; + } + if (highest_address != mm->highest_vm_end) { + printk("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); + bug = 1; + } i = browse_rb(&mm->mm_rb); - if (i != mm->map_count) - printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d rb %d\n", mm->map_count, i); + bug = 1; + } BUG_ON(bug); } #else -- cgit From db4fbfb9523c93583c339e66023506f651c1d54b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:49 -0800 Subject: mm: vm_unmapped_area() lookup function Implement vm_unmapped_area() using the rb_subtree_gap and highest_vm_end information to look up for suitable virtual address space gaps. struct vm_unmapped_area_info is used to define the desired allocation request: - lowest or highest possible address matching the remaining constraints - desired gap length - low/high address limits that the gap must fit into - alignment mask and offset Also update the generic arch_get_unmapped_area[_topdown] functions to make use of vm_unmapped_area() instead of implementing a brute force search. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 312 ++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 222 insertions(+), 90 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index ff93f6c8436c..5646677a96d5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1539,6 +1539,206 @@ unacct_error: return error; } +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + /* + * We implement the search by looking for an rbtree node that + * immediately follows a suitable gap. 
That is, + * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; + * - gap_end = vma->vm_start >= info->low_limit + length; + * - gap_end - gap_start >= length + */ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* Adjust search limits by the desired length */ + if (info->high_limit < length) + return -ENOMEM; + high_limit = info->high_limit - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + goto check_highest; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + goto check_highest; + + while (true) { + /* Visit left subtree if it looks promising */ + gap_end = vma->vm_start; + if (gap_end >= low_limit && vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; +check_current: + /* Check if current node has a suitable gap */ + if (gap_start > high_limit) + return -ENOMEM; + if (gap_end >= low_limit && gap_end - gap_start >= length) + goto found; + + /* Visit right subtree if it looks promising */ + if (vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + goto check_highest; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_left) { + gap_start = vma->vm_prev->vm_end; + gap_end = vma->vm_start; + goto check_current; + } + } + } + +check_highest: + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ + if (gap_start > high_limit) + return -ENOMEM; + +found: + /* We found a suitable gap. Clip it with the original low_limit. */ + if (gap_start < info->low_limit) + gap_start = info->low_limit; + + /* Adjust gap address to the desired alignment */ + gap_start += (info->align_offset - gap_start) & info->align_mask; + + VM_BUG_ON(gap_start + info->length > info->high_limit); + VM_BUG_ON(gap_start + info->length > gap_end); + return gap_start; +} + +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* + * Adjust search limits by the desired length. + * See implementation comment at top of unmapped_area(). 
+ */ + gap_end = info->high_limit; + if (gap_end < length) + return -ENOMEM; + high_limit = gap_end - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + if (gap_start <= high_limit) + goto found_highest; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + return -ENOMEM; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + return -ENOMEM; + + while (true) { + /* Visit right subtree if it looks promising */ + gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + if (gap_start <= high_limit && vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + +check_current: + /* Check if current node has a suitable gap */ + gap_end = vma->vm_start; + if (gap_end < low_limit) + return -ENOMEM; + if (gap_start <= high_limit && gap_end - gap_start >= length) + goto found; + + /* Visit left subtree if it looks promising */ + if (vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + return -ENOMEM; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_right) { + gap_start = vma->vm_prev ? + vma->vm_prev->vm_end : 0; + goto check_current; + } + } + } + +found: + /* We found a suitable gap. Clip it with the original high_limit. */ + if (gap_end > info->high_limit) + gap_end = info->high_limit; + +found_highest: + /* Compute highest gap address at the desired alignment */ + gap_end -= info->length; + gap_end -= (gap_end - info->align_offset) & info->align_mask; + + VM_BUG_ON(gap_end < info->low_limit); + VM_BUG_ON(gap_end < gap_start); + return gap_end; +} + /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -1557,7 +1757,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; if (len > TASK_SIZE) return -ENOMEM; @@ -1572,40 +1772,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (len > mm->cached_hole_size) { - start_addr = addr = mm->free_area_cache; - } else { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. 
- */ - if (start_addr != TASK_UNMAPPED_BASE) { - addr = TASK_UNMAPPED_BASE; - start_addr = addr; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - } + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); } #endif @@ -1630,7 +1803,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -1648,53 +1822,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - * - * Note: this is different with the case of bottomup - * which does the fully line-search, but we use find_vma - * here that causes some holes skipped. - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, @@ -1702,14 +1835,13 @@ fail: * can happen with large stack limits and large mmap() * allocations. 
*/ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } -- cgit From 78bd52097d04205a33a8014a1b8ac01cf1ae9d06 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:31 -0800 Subject: mm: adjust address_space_operations.migratepage() return code Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch-set follows the main idea discussed at 2012 LSFMMS session: "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/ to introduce the required changes to the virtio_balloon driver, as well as the changes to the core compaction & migration bits, in order to make those subsystems aware of ballooned pages and allow memory balloon pages become movable within a guest, thus avoiding the aforementioned fragmentation issue Following are numbers that prove this patch benefits on allowing compaction to be more effective at memory ballooned guests. Results for STRESS-HIGHALLOC benchmark, from Mel Gorman's mmtests suite, running on a 4gB RAM KVM guest which was ballooning 512mB RAM in 64mB chunks, at every minute (inflating/deflating), while test was running: ===BEGIN stress-highalloc STRESS-HIGHALLOC highalloc-3.7 highalloc-3.7 rc4-clean rc4-patch Pass 1 55.00 ( 0.00%) 62.00 ( 7.00%) Pass 2 54.00 ( 0.00%) 62.00 ( 8.00%) while Rested 75.00 ( 0.00%) 80.00 ( 5.00%) MMTests Statistics: duration 3.7 3.7 rc4-clean rc4-patch User 1207.59 1207.46 System 1300.55 1299.61 Elapsed 2273.72 2157.06 MMTests Statistics: vmstat 3.7 3.7 rc4-clean rc4-patch Page Ins 3581516 2374368 Page Outs 11148692 10410332 Swap Ins 80 47 Swap Outs 3641 476 Direct pages scanned 37978 33826 Kswapd pages scanned 1828245 1342869 Kswapd pages reclaimed 1710236 1304099 Direct pages reclaimed 32207 31005 Kswapd efficiency 93% 97% Kswapd velocity 804.077 622.546 Direct efficiency 84% 91% Direct velocity 16.703 15.682 Percentage direct scans 2% 2% Page writes by reclaim 79252 9704 Page writes file 75611 9228 Page writes anon 3641 476 Page reclaim immediate 16764 11014 Page rescued immediate 0 0 Slabs scanned 2171904 2152448 Direct inode steals 385 2261 Kswapd inode steals 659137 609670 Kswapd skipped wait 1 69 THP fault alloc 546 631 THP collapse alloc 361 339 THP splits 259 263 THP fault fallback 98 50 THP collapse fail 20 17 Compaction stalls 747 499 Compaction success 244 145 Compaction failures 503 354 Compaction pages moved 370888 474837 Compaction move failure 77378 65259 ===END stress-highalloc This patch: Introduce MIGRATEPAGE_SUCCESS as the default return code for address_space_operations.migratepage() method and documents the expected return code for the same method in failure cases. Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S. 
Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1dc4598d2513..33f5f82a6006 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -276,7 +276,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -346,7 +346,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, } spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -362,7 +362,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, if (!mapping) { if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -389,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -476,11 +476,11 @@ int migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -503,7 +503,7 @@ int buffer_migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; /* @@ -539,7 +539,7 @@ int buffer_migrate_page(struct address_space *mapping, } while (bh != head); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(buffer_migrate_page); #endif @@ -618,7 +618,7 @@ static int fallback_migrate_page(struct address_space *mapping, * * Return value: * < 0 - error code - * == 0 - success + * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, int remap_swapcache, enum migrate_mode mode) @@ -655,7 +655,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc) { + if (rc != MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { if (remap_swapcache) @@ -804,7 +804,7 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - mem_cgroup_end_migration(mem, page, newpage, rc == 0); + mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS); unlock: unlock_page(page); out: @@ -977,7 +977,7 @@ int migrate_pages(struct list_head *from, case -EAGAIN: retry++; break; - case 0: + case MIGRATEPAGE_SUCCESS: break; default: /* Permanent failure */ @@ -986,15 +986,12 @@ int migrate_pages(struct list_head *from, } } } - rc = 0; + rc = nr_failed + retry; out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (rc) - return rc; - - return nr_failed + retry; + return rc; } int migrate_huge_page(struct page *hpage, new_page_t get_new_page, @@ -1014,7 +1011,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, /* try again */ cond_resched(); break; - case 0: + case MIGRATEPAGE_SUCCESS: goto out; default: rc = -EIO; -- cgit From 18468d93e53b037e1a04ec58398eab763d054064 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:38 -0800 Subject: mm: introduce a common 
interface for balloon pages mobility Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch introduces a common interface to help a balloon driver on making its page set movable to compaction, and thus allowing the system to better leverage the compation efforts on memory defragmentation. [akpm@linux-foundation.org: use PAGE_FLAGS_CHECK_AT_PREP, s/__balloon_page_flags/page_flags_cleared/, small cleanups] [rientjes@google.com: allow balloon compaction for any system with memory compaction enabled, which is the defconfig] Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 15 +++ mm/Makefile | 3 +- mm/balloon_compaction.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 mm/balloon_compaction.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index a3f8dddaaab3..e6651c5de14f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,21 @@ config SPLIT_PTLOCK_CPUS default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +# +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && VIRTIO_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + # # support for memory compaction config COMPACTION diff --git a/mm/Makefile b/mm/Makefile index 6b025f80af34..3a4628751f89 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o interval_tree.o $(mmu-y) + compaction.o balloon_compaction.o \ + interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000000..07dbc8ec46cf --- /dev/null +++ b/mm/balloon_compaction.c @@ -0,0 +1,302 @@ +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#include +#include +#include +#include + +/* + * balloon_devinfo_alloc - allocates a balloon device information descriptor. + * @balloon_dev_descriptor: pointer to reference the balloon device which + * this struct balloon_dev_info will be servicing. + * + * Driver must call it to properly allocate and initialize an instance of + * struct balloon_dev_info which will be used to reference a balloon device + * as well as to keep track of the balloon device page list. 
+ */ +struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) +{ + struct balloon_dev_info *b_dev_info; + b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); + if (!b_dev_info) + return ERR_PTR(-ENOMEM); + + b_dev_info->balloon_device = balloon_dev_descriptor; + b_dev_info->mapping = NULL; + b_dev_info->isolated_pages = 0; + spin_lock_init(&b_dev_info->pages_lock); + INIT_LIST_HEAD(&b_dev_info->pages); + + return b_dev_info; +} +EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); + +/* + * balloon_page_enqueue - allocates a new page and inserts it into the balloon + * page list. + * @b_dev_info: balloon device decriptor where we will insert a new page to + * + * Driver must call it to properly allocate a new enlisted balloon page + * before definetively removing it from the guest system. + * This function returns the page address for the recently enqueued page or + * NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!page) + return NULL; + + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. + */ + BUG_ON(!trylock_page(page)); + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * the its address to allow the driver release the page. + * @b_dev_info: balloon device decriptor where we will grab a page from. + * + * Driver must call it to properly de-allocate a previous enlisted balloon page + * before definetively releasing it back to the guest system. + * This function returns the page address for the recently dequeued page or + * NULL in the case we find balloon's page list temporarily empty due to + * compaction isolated pages. + */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + struct page *page, *tmp; + unsigned long flags; + bool dequeued_page; + + dequeued_page = false; + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + /* + * Block others from accessing the 'page' while we get around + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. + */ + if (trylock_page(page)) { + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + /* + * Raise the page refcount here to prevent any wrong + * attempt to isolate this page, in case of coliding + * with balloon_page_isolate() just after we release + * the page lock. + * + * balloon_page_free() will take care of dropping + * this extra refcount later. + */ + get_page(page); + balloon_page_delete(page); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + dequeued_page = true; + break; + } + } + + if (!dequeued_page) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there is no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck into + * an infinite loop while attempting to release all its pages. 
+ */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + page = NULL; + } + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. + * @b_dev_info: holds the balloon device information descriptor. + * @a_ops: balloon_mapping address_space_operations descriptor. + * + * Driver must call it to properly allocate and initialize an instance of + * struct address_space which will be used as the special page->mapping for + * balloon device enlisted page instances. + */ +struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, + const struct address_space_operations *a_ops) +{ + struct address_space *mapping; + + mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return ERR_PTR(-ENOMEM); + + /* + * Give a clean 'zeroed' status to all elements of this special + * balloon page->mapping struct address_space instance. + */ + address_space_init_once(mapping); + + /* + * Set mapping->flags appropriately, to allow balloon pages + * ->mapping identification. + */ + mapping_set_balloon(mapping); + mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); + + /* balloon's page->mapping->a_ops callback descriptor */ + mapping->a_ops = a_ops; + + /* + * Establish a pointer reference back to the balloon device descriptor + * this particular page->mapping will be servicing. + * This is used by compaction / migration procedures to identify and + * access the balloon device pageset while isolating / migrating pages. + * + * As some balloon drivers can register multiple balloon devices + * for a single guest, this also helps compaction / migration to + * properly deal with multiple balloon pagesets, when required. + */ + mapping->private_data = b_dev_info; + b_dev_info->mapping = mapping; + + return mapping; +} +EXPORT_SYMBOL_GPL(balloon_mapping_alloc); + +static inline void __isolate_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline void __putback_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline int __migrate_balloon_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); +} + +/* __isolate_lru_page() counterpart for a ballooned page */ +bool balloon_page_isolate(struct page *page) +{ + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a balloon page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. 
+ */ + if (likely(get_page_unless_zero(page))) { + /* + * As balloon pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the balloon driver releasing a page. + * + * In order to avoid having an already isolated balloon page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released by + * the balloon driver, lets be sure we have the page lock + * before proceeding with the balloon page isolation steps. + */ + if (likely(trylock_page(page))) { + /* + * A ballooned page, by default, has just one refcount. + * Prevent concurrent compaction threads from isolating + * an already isolated balloon page by refcount check. + */ + if (__is_movable_balloon_page(page) && + page_count(page) == 2) { + __isolate_balloon_page(page); + unlock_page(page); + return true; + } + unlock_page(page); + } + put_page(page); + } + return false; +} + +/* putback_lru_page() counterpart for a ballooned page */ +void balloon_page_putback(struct page *page) +{ + /* + * 'lock_page()' stabilizes the page and prevents races against + * concurrent isolation threads attempting to re-isolate it. + */ + lock_page(page); + + if (__is_movable_balloon_page(page)) { + __putback_balloon_page(page); + /* drop the extra ref count taken for page isolation */ + put_page(page); + } else { + WARN_ON(1); + dump_page(page); + } + unlock_page(page); +} + +/* move_to_new_page() counterpart for a ballooned page */ +int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct address_space *mapping; + int rc = -EAGAIN; + + /* + * Block others from accessing the 'newpage' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'newpage' at this point. + */ + BUG_ON(!trylock_page(newpage)); + + if (WARN_ON(!__is_movable_balloon_page(page))) { + dump_page(page); + unlock_page(newpage); + return rc; + } + + mapping = page->mapping; + if (mapping) + rc = __migrate_balloon_page(mapping, newpage, page, mode); + + unlock_page(newpage); + return rc; +} +#endif /* CONFIG_BALLOON_COMPACTION */ -- cgit From bf6bddf1924eaebf2beb85e4249a89dd16d4eed6 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:42 -0800 Subject: mm: introduce compaction and migration for ballooned pages Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch introduces the helper functions as well as the necessary changes to teach compaction and migration bits how to cope with pages which are part of a guest memory balloon, in order to make them movable by memory compaction procedures. Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Cc: Rusty Russell Cc: "Michael S. 
Tsirkin" Cc: Rik van Riel Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 21 +++++++++++++++++++-- mm/migrate.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 694eaabaaebd..470474c03b61 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, goto next_pageblock; } - /* Check may be lockless but that's ok as we recheck later */ - if (!PageLRU(page)) + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU pages and balloon pages + * Skip any other type of page + */ + if (!PageLRU(page)) { + if (unlikely(balloon_page_movable(page))) { + if (locked && balloon_page_isolate(page)) { + /* Successfully isolated */ + cc->finished_update_migrate = true; + list_add(&page->lru, migratelist); + cc->nr_migratepages++; + nr_isolated++; + goto check_compact_cluster; + } + } continue; + } /* * PageLRU is set. lru_lock normally excludes isolation @@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, cc->nr_migratepages++; nr_isolated++; +check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; diff --git a/mm/migrate.c b/mm/migrate.c index 33f5f82a6006..427343c0c296 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -79,7 +80,10 @@ void putback_lru_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + if (unlikely(balloon_page_movable(page))) + balloon_page_putback(page); + else + putback_lru_page(page); } } @@ -768,6 +772,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } + if (unlikely(balloon_page_movable(page))) { + /* + * A ballooned page does not need any special attention from + * physical to virtual reverse mapping procedures. + * Skip any attempt to unmap PTEs or to remap swap cache, + * in order to avoid burning cycles at rmap level, and perform + * the page migration right away (proteced by page lock). + */ + rc = balloon_page_migrate(newpage, page, mode); + goto uncharge; + } + /* * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU @@ -804,7 +820,9 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS); + mem_cgroup_end_migration(mem, page, newpage, + (rc == MIGRATEPAGE_SUCCESS || + rc == MIGRATEPAGE_BALLOON_SUCCESS)); unlock: unlock_page(page); out: @@ -836,6 +854,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto out; rc = __unmap_and_move(page, newpage, force, offlining, mode); + + if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { + /* + * A ballooned page has been migrated already. + * Now, it's the time to wrap-up counters, + * handle the page back to Buddy and return. 
+ */ + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + balloon_page_free(page); + return MIGRATEPAGE_SUCCESS; + } out: if (rc != -EAGAIN) { /* -- cgit From 5733c7d11dff44e98d2ca16617886a78086b354f Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:47 -0800 Subject: mm: introduce putback_movable_pages() The PATCH "mm: introduce compaction and migration for virtio ballooned pages" hacks around putback_lru_pages() in order to allow ballooned pages to be re-inserted on balloon page list as if a ballooned page was like a LRU page. As ballooned pages are not legitimate LRU pages, this patch introduces putback_movable_pages() to properly cope with cases where the isolated pageset contains ballooned pages and LRU pages, thus fixing the mentioned inelegant hack around putback_lru_pages(). Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- mm/migrate.c | 20 ++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 470474c03b61..d24dd2d7bad4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1003,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: @@ -1026,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); - /* Release LRU pages not migrated */ + /* Release isolated pages not migrated */ if (err) { - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; if (err == -ENOMEM) { ret = COMPACT_PARTIAL; diff --git a/mm/migrate.c b/mm/migrate.c index 427343c0c296..3f675ca08279 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -76,6 +76,26 @@ void putback_lru_pages(struct list_head *l) struct page *page; struct page *page2; + list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } +} + +/* + * Put previously isolated pages back onto the appropriate lists + * from where they were once taken off for compaction/migration. + * + * This function shall be used instead of putback_lru_pages(), + * whenever the isolated pageset has been built by isolate_migratepages_range() + */ +void putback_movable_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a7b7611d4e4..d9fbac1b5030 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5761,7 +5761,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); return ret > 0 ? 
0 : ret; } -- cgit From 6f6313d4870f9642cb3ea8ec892cf6da81331b9c Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 11 Dec 2012 16:02:48 -0800 Subject: mm/vmscan.c: try_to_freeze() returns boolean kswapd()->try_to_freeze() is defined to return a boolean, so it's better to use a bool to hold its return value. Signed-off-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 53947311c777..157bb116dec8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2963,7 +2963,7 @@ static int kswapd(void *p) classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { - int ret; + bool ret; /* * If the last balance_pgdat was unsuccessful it's unlikely a -- cgit From 212a0a6f28dda0a1e732d20d57abb465750d473c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:51 -0800 Subject: mm, mempolicy: remove duplicate code Remove some duplicate code and simplify alloc_pages_vma(). No functional change. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ea600da8940..05b28361a39b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { struct mempolicy *pol; - struct zonelist *zl; struct page *page; unsigned int cpuset_mems_cookie; @@ -1926,23 +1925,11 @@ retry_cpuset: return page; } - zl = policy_zonelist(gfp, pol, node); - if (unlikely(mpol_needs_cond_ref(pol))) { - /* - * slow path: ref counted shared policy - */ - struct page *page = __alloc_pages_nodemask(gfp, order, - zl, policy_nodemask(gfp, pol)); - __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; - return page; - } - /* - * fast path: default or task policy - */ - page = __alloc_pages_nodemask(gfp, order, zl, + page = __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol, node), policy_nodemask(gfp, pol)); + if (unlikely(mpol_needs_cond_ref(pol))) + __mpol_put(pol); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; return page; -- cgit From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type of oom_score_adj to short The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. 
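[Editor's note] For context on why a signed short is wide enough: the only values userspace can feed in through /proc/<pid>/oom_score_adj are already clamped to OOM_SCORE_ADJ_MIN..OOM_SCORE_ADJ_MAX, i.e. -1000..1000. A minimal userspace sketch (not part of the patch) exercising that interface:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	short adj = 500;	/* any value in -1000..1000 fits in a short */
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	fprintf(f, "%hd\n", adj);
	fclose(f);

	f = fopen("/proc/self/oom_score_adj", "r");
	if (f && fscanf(f, "%hd", &adj) == 1)
		printf("oom_score_adj is now %hd\n", adj);
	if (f)
		fclose(f);
	return EXIT_SUCCESS;
}

Raising the value (toward 1000) needs no privileges, which makes this an easy way to confirm the accepted range on a running system.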
Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- mm/oom_kill.c | 10 +++++----- mm/swapfile.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 31ae5ea1eac0..b4d5a9deb17f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,7 +1919,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - int oom_score_adj; + short oom_score_adj; oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = unmerge_and_remove_all_rmap_items(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7e9e9113bd05..37ab4c5ab6e8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); * @old_val. Usually used to reinstate a previous value to prevent racing with * userspacing tuning the value in the interim. */ -void compare_swap_oom_score_adj(int old_val, int new_val) +void compare_swap_oom_score_adj(short old_val, short new_val) { struct sighand_struct *sighand = current->sighand; @@ -72,7 +72,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) * synchronization and returns the old value. Usually used to temporarily * set a value, save the old value in the caller, and then reinstate it later. */ -int test_set_oom_score_adj(int new_val) +short test_set_oom_score_adj(short new_val) { struct sighand_struct *sighand = current->sighand; int old_val; @@ -193,7 +193,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - adj = p->signal->oom_score_adj; + adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; @@ -399,7 +399,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), task->mm->nr_ptes, @@ -415,7 +415,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%d\n", + "oom_score_adj=%hd\n", current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 0fbb45283c66..bb6f6a04e92d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int oom_score_adj; + short oom_score_adj; int i, type, prev; int err; -- cgit From e1e12d2f3104be886073ac6c5c4678f30b1b9e51 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:56 -0800 Subject: mm, oom: fix race when specifying a thread as the oom origin test_set_oom_score_adj() and compare_swap_oom_score_adj() are used to specify that current should be killed first if an oom condition occurs in between the two calls. The usage is short oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); ... 
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); to store the thread's oom_score_adj, temporarily change it to the maximum score possible, and then restore the old value if it is still the same. This happens to still be racy, however, if the user writes OOM_SCORE_ADJ_MAX to /proc/pid/oom_score_adj in between the two calls. The compare_swap_oom_score_adj() will then incorrectly reset the old value prior to the write of OOM_SCORE_ADJ_MAX. To fix this, introduce a new oom_flags_t member in struct signal_struct that will be used for per-thread oom killer flags. KSM and swapoff can now use a bit in this member to specify that threads should be killed first in oom conditions without playing around with oom_score_adj. This also allows the correct oom_score_adj to always be shown when reading /proc/pid/oom_score. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++----- mm/oom_kill.c | 49 +++++++------------------------------------------ mm/swapfile.c | 5 ++--- 3 files changed, 11 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b4d5a9deb17f..382d930a0bf1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - short oom_score_adj; - - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, - oom_score_adj); + clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 37ab4c5ab6e8..18f1ae2b45de 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj - * @old_val: old oom_score_adj for compare - * @new_val: new oom_score_adj for swap - * - * Sets the oom_score_adj value for current to @new_val iff its present value is - * @old_val. Usually used to reinstate a previous value to prevent racing with - * userspacing tuning the value in the interim. - */ -void compare_swap_oom_score_adj(short old_val, short new_val) -{ - struct sighand_struct *sighand = current->sighand; - - spin_lock_irq(&sighand->siglock); - if (current->signal->oom_score_adj == old_val) - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); -} - -/** - * test_set_oom_score_adj() - set current's oom_score_adj and return old value - * @new_val: new oom_score_adj value - * - * Sets the oom_score_adj value for current to @new_val with proper - * synchronization and returns the old value. Usually used to temporarily - * set a value, save the old value in the caller, and then reinstate it later. 
- */ -short test_set_oom_score_adj(short new_val) -{ - struct sighand_struct *sighand = current->sighand; - int old_val; - - spin_lock_irq(&sighand->siglock); - old_val = current->signal->oom_score_adj; - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); - - return old_val; -} - #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill @@ -310,6 +268,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. + */ + if (oom_task_origin(task)) + return OOM_SCAN_SELECT; + if (task->flags & PF_EXITING && !force_kill) { /* * If this task is not being ptraced on exit, then wait for it diff --git a/mm/swapfile.c b/mm/swapfile.c index bb6f6a04e92d..e97a0e5aea91 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - short oom_score_adj; int i, type, prev; int err; @@ -1557,9 +1556,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = try_to_unuse(type, false, 0); /* force all pages to be unused */ - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); + clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ -- cgit From 2e30abd1730751d58463d88bc0844ab8fd7112a9 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 11 Dec 2012 16:02:57 -0800 Subject: mm: cma: skip watermarks check for already isolated blocks in split_free_page() Since commit 2139cbe627b8 ("cma: fix counting of isolated pages") free pages in isolated pageblocks are not accounted to NR_FREE_PAGES counters, so watermarks check is not required if one operates on a free page in isolated pageblock. 
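[Editor's note] The reordered check is easy to model in isolation. A simplified standalone sketch with stand-in types and counters (not the kernel code): the watermark test and the free-page accounting only make sense for pageblocks that are still accounted as free, i.e. anything but MIGRATE_ISOLATE.

#include <stdbool.h>
#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct zone_model {
	long nr_free;		/* stand-in for NR_FREE_PAGES */
	long low_wmark;		/* stand-in for low_wmark_pages(zone) */
};

/* Returns true when capturing 1 << order pages is allowed. */
static bool may_capture_page(struct zone_model *z, int order, enum migratetype mt)
{
	if (mt != MIGRATE_ISOLATE) {
		/* Obey watermarks as if the page was being allocated. */
		if (z->nr_free - (1L << order) < z->low_wmark)
			return false;
		z->nr_free -= 1L << order;	/* models __mod_zone_freepage_state() */
	}
	/* Isolated blocks were already subtracted from NR_FREE_PAGES. */
	return true;
}

int main(void)
{
	struct zone_model z = { .nr_free = 512, .low_wmark = 500 };

	printf("movable:  %d\n", may_capture_page(&z, 4, MIGRATE_MOVABLE));
	printf("isolated: %d\n", may_capture_page(&z, 4, MIGRATE_ISOLATE));
	return 0;
}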
Signed-off-by: Marek Szyprowski Cc: Kyungmin Park Cc: Arnd Bergmann Cc: Mel Gorman Acked-by: Michal Nazarewicz Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9fbac1b5030..265fea4fbc81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1394,21 +1394,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone = page_zone(page); order = page_order(page); + mt = get_pageblock_migratetype(page); - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - return 0; + if (mt != MIGRATE_ISOLATE) { + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + } /* Remove page from free list */ list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - mt = get_pageblock_migratetype(page); - if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); - if (alloc_order != order) expand(zone, page, alloc_order, order, &zone->free_area[order], migratetype); -- cgit From bc357f431c836c6631751e3ef7dfe7882394ad67 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 11 Dec 2012 16:02:59 -0800 Subject: mm: cma: remove watermark hacks Commits 2139cbe627b8 ("cma: fix counting of isolated pages") and d95ea5d18e69 ("cma: fix watermark checking") introduced a reliable method of free page accounting when memory is being allocated from CMA regions, so the workaround introduced earlier by commit 49f223a9cd96 ("mm: trigger page reclaim in alloc_contig_range() to stabilise watermarks") can be finally removed. Signed-off-by: Marek Szyprowski Cc: Kyungmin Park Cc: Arnd Bergmann Cc: Mel Gorman Acked-by: Michal Nazarewicz Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 58 --------------------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 265fea4fbc81..5a8d339d282a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5218,10 +5218,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); - zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); - zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); - zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5766,54 +5762,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return ret > 0 ? 0 : ret; } -/* - * Update zone's cma pages counter used for watermark level calculation. - */ -static inline void __update_cma_watermarks(struct zone *zone, int count) -{ - unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); - zone->min_cma_pages += count; - spin_unlock_irqrestore(&zone->lock, flags); - setup_per_zone_wmarks(); -} - -/* - * Trigger memory pressure bump to reclaim some pages in order to be able to - * allocate 'count' pages in single page units. 
Does similar work as - *__alloc_pages_slowpath() function. - */ -static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) -{ - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zonelist *zonelist = node_zonelist(0, gfp_mask); - int did_some_progress = 0; - int order = 1; - - /* - * Increase level of watermarks to force kswapd do his job - * to stabilise at new watermark level. - */ - __update_cma_watermarks(zone, count); - - /* Obey watermarks as if the page was being allocated */ - while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { - wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); - - did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, - NULL); - if (!did_some_progress) { - /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, NULL, false); - } - } - - /* Restore original watermark levels. */ - __update_cma_watermarks(zone, -count); - - return count; -} - /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -5837,7 +5785,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { - struct zone *zone = page_zone(pfn_to_page(start)); unsigned long outer_start, outer_end; int ret = 0, order; @@ -5922,11 +5869,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, goto done; } - /* - * Reclaim enough pages to make sure that contiguous allocation - * will not starve the system. - */ - __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); -- cgit From 81df9bff2609f07cef4690ac2ebda1611b55b05a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:03:10 -0800 Subject: bootmem: fix wrong call parameter for free_bootmem() It is strange that alloc_bootmem() returns a virtual address and free_bootmem() requires a physical address. Anyway, free_bootmem()'s first parameter should be physical address. There are some call sites for free_bootmem() with virtual address. So fix them. [akpm@linux-foundation.org: improve free_bootmem() and free_bootmem_pate() documentation] Signed-off-by: Joonsoo Kim Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Johannes Weiner Cc: FUJITA Tomonori Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..ecc45958ac0c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) /* * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * This is only useful when the bootmem allocator has already been torn * down, but we are still initializing the system. Pages are given directly * to the page allocator, no bootmem metadata is updated because it is gone. 
*/ -void __init free_bootmem_late(unsigned long addr, unsigned long size) +void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); + cursor = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), 0); @@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. * * The range must be contiguous but may span node boundaries. */ -void __init free_bootmem(unsigned long addr, unsigned long size) +void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - start = PFN_UP(addr); - end = PFN_DOWN(addr + size); + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); mark_bootmem(start, end, 0, 0); } -- cgit From 511c2aba8f07fc45bdcba548cb63f7b8a450c6dc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:16 -0800 Subject: mm, memory-hotplug: dynamic configure movable memory and portion memory Add online_movable and online_kernel for logic memory hotplug. This is the dynamic version of "movablecore" & "kernelcore". We have the same reason to introduce it as to introduce "movablecore" & "kernelcore". It has the same motive as "movablecore" & "kernelcore", but it is dynamic/running-time: o We can configure memory as kernelcore or movablecore after boot. Userspace workload is increased, we need more hugepage, we can't use "online_movable" to add memory and allow the system use more THP(transparent-huge-page), vice-verse when kernel workload is increase. Also help for virtualization to dynamic configure host/guest's memory, to save/(reduce waste) memory. Memory capacity on Demand o When a new node is physically online after boot, we need to use "online_movable" or "online_kernel" to configure/portion it as we expected when we logic-online it. This configuration also helps for physically-memory-migrate. o all benefit as the same as existed "movablecore" & "kernelcore". o Preparing for movable-node, which is very important for power-saving, hardware partitioning and high-available-system(hardware fault management). (Note, we don't introduce movable-node here.) Action behavior: When a memoryblock/memorysection is onlined by "online_movable", the kernel will not have directly reference to the page of the memoryblock, thus we can remove that memory any time when needed. When it is online by "online_kernel", the kernel can use it. When it is online by "online", the zone type doesn't changed. Current constraints: Only the memoryblock which is adjacent to the ZONE_MOVABLE can be online from ZONE_NORMAL to ZONE_MOVABLE. 
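[Editor's note] The knob this series exposes is driven from userspace by writing "online_movable" or "online_kernel" into a memory block's sysfs state file. A hedged userspace sketch, assuming the standard memory-hotplug sysfs layout; the block number is only an example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_memory_block_state(int block, const char *state)
{
	char path[128];
	int fd;
	ssize_t ret;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/memory/memory%d/state", block);
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return -1;
	}
	ret = write(fd, state, strlen(state));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* Block number 32 is only an example; pick a block adjacent to ZONE_MOVABLE. */
	return set_memory_block_state(32, "online_movable") ? 1 : 0;
}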
[akpm@linux-foundation.org: use min_t, cleanups] Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 571130ee66d7..5c1f4959e6b4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -214,6 +214,88 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +static int move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long flags; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) + goto out_fail; + /* the move out part mast at the left most of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must included/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + resize_zone(z1, z1->zone_start_pfn, end_pfn); + resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z1, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static int move_pfn_range_right(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long flags; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are lower than @z1 */ + if (z1->zone_start_pfn > start_pfn) + goto out_fail; + /* the move out part mast at the right most of @z1 */ + if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) + goto out_fail; + /* must included/overlap */ + if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) + goto out_fail; + + resize_zone(z1, z1->zone_start_pfn, start_pfn); + resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z2, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -508,7 +590,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long onlined_pages = 0; struct zone *zone; @@ -525,6 +607,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) */ zone = page_zone(pfn_to_page(pfn)); + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if 
(move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + + /* Previous code may changed the zone of the pfn range */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); -- cgit From e455a9b92d6e19a3f0f7eb6f6241efa566a7e81a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:20 -0800 Subject: memory_hotplug: handle empty zone when online_movable/online_kernel Make online_movable/online_kernel can empty a zone or can move memory to a empty zone. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5c1f4959e6b4..c370491bdb97 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,8 +219,17 @@ static void resize_zone(struct zone *zone, unsigned long start_pfn, { zone_span_writelock(zone); - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * make it consist as free_area_init_core(), + * if spanned_pages = 0, then keep start_pfn = 0 + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } zone_span_writeunlock(zone); } @@ -236,10 +245,19 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } -static int move_pfn_range_left(struct zone *z1, struct zone *z2, +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { + int ret; unsigned long flags; + unsigned long z1_start_pfn; + + if (!z1->wait_table) { + ret = init_currently_empty_zone(z1, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -253,7 +271,13 @@ static int move_pfn_range_left(struct zone *z1, struct zone *z2, if (end_pfn <= z2->zone_start_pfn) goto out_fail; - resize_zone(z1, z1->zone_start_pfn, end_pfn); + /* use start_pfn for z1's start_pfn if z1 is empty */ + if (z1->spanned_pages) + z1_start_pfn = z1->zone_start_pfn; + else + z1_start_pfn = start_pfn; + + resize_zone(z1, z1_start_pfn, end_pfn); resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); pgdat_resize_unlock(z1->zone_pgdat, &flags); @@ -266,10 +290,19 @@ out_fail: return -1; } -static int move_pfn_range_right(struct zone *z1, struct zone *z2, +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { + int ret; unsigned long flags; + unsigned long z2_end_pfn; + + if (!z2->wait_table) { + ret = init_currently_empty_zone(z2, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -283,8 +316,14 @@ static int move_pfn_range_right(struct zone *z1, struct zone *z2, if 
(start_pfn >= z1->zone_start_pfn + z1->spanned_pages) goto out_fail; + /* use end_pfn for z2's end_pfn if z2 is empty */ + if (z2->spanned_pages) + z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; + else + z2_end_pfn = end_pfn; + resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); + resize_zone(z2, start_pfn, z2_end_pfn); pgdat_resize_unlock(z1->zone_pgdat, &flags); -- cgit From 74d42d8fe146e870c52bde3b1c692f86cc8ff844 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:23 -0800 Subject: memory_hotplug: ensure every online node has NORMAL memory Old memory hotplug code and new online/movable may cause a online node don't have any normal memory, but memory-management acts bad when we have nodes which is online but don't have any normal memory. Example: it may cause a bound task fail on all kernel allocation and cause the task can't create task or create other kernel object. So we disable non-normal-memory-node here, we will enable it when we prepared. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c370491bdb97..de9cb14ae753 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -581,6 +581,12 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +/* ensure every online node has NORMAL memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); +} + /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -646,6 +652,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); + if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + !can_online_high_movable(zone)) { + unlock_memory_hotplug(); + return -1; + } + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); @@ -1054,6 +1066,30 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +/* ensure the node has NORMAL memory if it is still online */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + if (present_pages > nr_pages) + return true; + + present_pages = 0; + for (; zt <= ZONE_MOVABLE; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + /* + * we can't offline the last normal memory until all + * higher memory is offlined. 
+ */ + return present_pages == 0; +} + /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1141,6 +1177,10 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; + ret = -EINVAL; + if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) + goto out; + /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); -- cgit From 2d7a695604556913d9ef1ead47881b804d6c52b4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 12 Dec 2012 13:50:42 -0800 Subject: bootmem: remove not implemented function call, bootmem_arch_preferred_node() There is no implementation of bootmem_arch_preferred_node() and a call to this function will cause a compilation error. So remove it. Signed-off-by: Joonsoo Kim Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Johannes Weiner Cc: FUJITA Tomonori Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index ecc45958ac0c..4730931b45a9 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -581,18 +581,6 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, { if (WARN_ON_ONCE(slab_is_available())) return kzalloc(size, GFP_NOWAIT); - -#ifdef CONFIG_HAVE_ARCH_BOOTMEM - { - bootmem_data_t *p_bdata; - - p_bdata = bootmem_arch_preferred_node(bdata, size, align, - goal, limit); - if (p_bdata) - return alloc_bootmem_bdata(p_bdata, size, align, - goal, limit); - } -#endif return NULL; } -- cgit From 3f7dfe24b84c0ad698c461fc9c4ba3544bbfcebf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 12 Dec 2012 13:50:43 -0800 Subject: bootmem: remove alloc_arch_preferred_bootmem() The name of this function is not suitable, and removing the function and open-coding it into each call sites makes the code more understandable. Additionally, we shouldn't do an allocation from bootmem when slab_is_available(), so directly return kmalloc()'s return value. 
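[Editor's note] The control flow the patch settles on can be stated in a few lines. A standalone sketch with stand-in names (not the kernel implementation): once the slab allocator is available, the request is answered by kzalloc() alone and its result is returned as-is, with no fallback into bootmem.

#include <stdbool.h>
#include <stdlib.h>

static bool slab_up;			/* stand-in for slab_is_available() */

static void *alloc_from_bootmem(size_t size)
{
	return calloc(1, size);		/* placeholder for the bootmem block search */
}

static void *early_alloc(size_t size)
{
	if (slab_up) {
		/*
		 * Return the slab result directly: before the patch a failed
		 * allocation here could still fall through to the bootmem
		 * path, which is wrong once slab is up.
		 */
		return calloc(1, size);	/* models kzalloc(size, GFP_NOWAIT) */
	}
	return alloc_from_bootmem(size);
}

int main(void)
{
	free(early_alloc(64));		/* early boot: bootmem-style path */
	slab_up = true;
	free(early_alloc(64));		/* later: served by "slab" only */
	return 0;
}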
Signed-off-by: Joonsoo Kim Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Johannes Weiner Cc: FUJITA Tomonori Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 4730931b45a9..26d057a8b552 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -575,15 +575,6 @@ find_block: return NULL; } -static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - return NULL; -} - static void * __init alloc_bootmem_core(unsigned long size, unsigned long align, unsigned long goal, @@ -592,9 +583,8 @@ static void * __init alloc_bootmem_core(unsigned long size, bootmem_data_t *bdata; void *region; - region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); - if (region) - return region; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) @@ -692,11 +682,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, { void *ptr; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); again: - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, - align, goal, limit); - if (ptr) - return ptr; /* do not panic in alloc_bootmem_bdata() */ if (limit && goal + size > limit) -- cgit From 4a6c1297268c917e9c50701906ba2f7e06812299 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:47 -0800 Subject: thp: huge zero page: basic preparation During testing I noticed big (up to 2.5 times) memory consumption overhead on some workloads (e.g. ft.A from NPB) if THP is enabled. The main reason for that big difference is lacking zero page in THP case. We have to allocate a real page on read page fault. A program to demonstrate the issue: #include #include #include #define MB 1024*1024 int main(int argc, char **argv) { char *p; int i; posix_memalign((void **)&p, 2 * MB, 200 * MB); for (i = 0; i < 200 * MB; i+= 4096) assert(p[i] == 0); pause(); return 0; } With thp-never RSS is about 400k, but with thp-always it's 200M. After the patcheset thp-always RSS is 400k too. Design overview. Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled with zeros. The way how we allocate it changes in the patchset: - [01/10] simplest way: hzp allocated on boot time in hugepage_init(); - [09/10] lazy allocation on first use; - [10/10] lockless refcounting + shrinker-reclaimable hzp; We setup it in do_huge_pmd_anonymous_page() if area around fault address is suitable for THP and we've got read page fault. If we fail to setup hzp (ENOMEM) we fallback to handle_pte_fault() as we normally do in THP. On wp fault to hzp we allocate real memory for the huge page and clear it. If ENOMEM, graceful fallback: we create a new pmd table and set pte around fault address to newly allocated normal (4k) page. All other ptes in the pmd set to normal zero page. We cannot split hzp (and it's bug if we try), but we can split the pmd which points to it. On splitting the pmd we create a table with all ptes set to normal zero page. === By hpa's request I've tried alternative approach for hzp implementation (see Virtual huge zero page patchset): pmd table with all entries set to zero page. 
This way should be more cache friendly, but it increases TLB pressure. The problem with virtual huge zero page: it requires per-arch enabling. We need a way to mark that pmd table has all ptes set to zero page. Some numbers to compare two implementations (on 4s Westmere-EX): Mirobenchmark1 ============== test: posix_memalign((void **)&p, 2 * MB, 8 * GB); for (i = 0; i < 100; i++) { assert(memcmp(p, p + 4*GB, 4*GB) == 0); asm volatile ("": : :"memory"); } hzp: Performance counter stats for './test_memcmp' (5 runs): 32356.272845 task-clock # 0.998 CPUs utilized ( +- 0.13% ) 40 context-switches # 0.001 K/sec ( +- 0.94% ) 0 CPU-migrations # 0.000 K/sec 4,218 page-faults # 0.130 K/sec ( +- 0.00% ) 76,712,481,765 cycles # 2.371 GHz ( +- 0.13% ) [83.31%] 36,279,577,636 stalled-cycles-frontend # 47.29% frontend cycles idle ( +- 0.28% ) [83.35%] 1,684,049,110 stalled-cycles-backend # 2.20% backend cycles idle ( +- 2.96% ) [66.67%] 134,355,715,816 instructions # 1.75 insns per cycle # 0.27 stalled cycles per insn ( +- 0.10% ) [83.35%] 13,526,169,702 branches # 418.039 M/sec ( +- 0.10% ) [83.31%] 1,058,230 branch-misses # 0.01% of all branches ( +- 0.91% ) [83.36%] 32.413866442 seconds time elapsed ( +- 0.13% ) vhzp: Performance counter stats for './test_memcmp' (5 runs): 30327.183829 task-clock # 0.998 CPUs utilized ( +- 0.13% ) 38 context-switches # 0.001 K/sec ( +- 1.53% ) 0 CPU-migrations # 0.000 K/sec 4,218 page-faults # 0.139 K/sec ( +- 0.01% ) 71,964,773,660 cycles # 2.373 GHz ( +- 0.13% ) [83.35%] 31,191,284,231 stalled-cycles-frontend # 43.34% frontend cycles idle ( +- 0.40% ) [83.32%] 773,484,474 stalled-cycles-backend # 1.07% backend cycles idle ( +- 6.61% ) [66.67%] 134,982,215,437 instructions # 1.88 insns per cycle # 0.23 stalled cycles per insn ( +- 0.11% ) [83.32%] 13,509,150,683 branches # 445.447 M/sec ( +- 0.11% ) [83.34%] 1,017,667 branch-misses # 0.01% of all branches ( +- 1.07% ) [83.32%] 30.381324695 seconds time elapsed ( +- 0.13% ) Mirobenchmark2 ============== test: posix_memalign((void **)&p, 2 * MB, 8 * GB); for (i = 0; i < 1000; i++) { char *_p = p; while (_p < p+4*GB) { assert(*_p == *(_p+4*GB)); _p += 4096; asm volatile ("": : :"memory"); } } hzp: Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs): 3505.727639 task-clock # 0.998 CPUs utilized ( +- 0.26% ) 9 context-switches # 0.003 K/sec ( +- 4.97% ) 4,384 page-faults # 0.001 M/sec ( +- 0.00% ) 8,318,482,466 cycles # 2.373 GHz ( +- 0.26% ) [33.31%] 5,134,318,786 stalled-cycles-frontend # 61.72% frontend cycles idle ( +- 0.42% ) [33.32%] 2,193,266,208 stalled-cycles-backend # 26.37% backend cycles idle ( +- 5.51% ) [33.33%] 9,494,670,537 instructions # 1.14 insns per cycle # 0.54 stalled cycles per insn ( +- 0.13% ) [41.68%] 2,108,522,738 branches # 601.451 M/sec ( +- 0.09% ) [41.68%] 158,746 branch-misses # 0.01% of all branches ( +- 1.60% ) [41.71%] 3,168,102,115 L1-dcache-loads # 903.693 M/sec ( +- 0.11% ) [41.70%] 1,048,710,998 L1-dcache-misses # 33.10% of all L1-dcache hits ( +- 0.11% ) [41.72%] 1,047,699,685 LLC-load # 298.854 M/sec ( +- 0.03% ) [33.38%] 2,287 LLC-misses # 0.00% of all LL-cache hits ( +- 8.27% ) [33.37%] 3,166,187,367 dTLB-loads # 903.147 M/sec ( +- 0.02% ) [33.35%] 4,266,538 dTLB-misses # 0.13% of all dTLB cache hits ( +- 0.03% ) [33.33%] 3.513339813 seconds time elapsed ( +- 0.26% ) vhzp: Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs): 27313.891128 task-clock # 0.998 CPUs utilized ( +- 0.24% ) 62 context-switches # 0.002 K/sec ( +- 0.61% ) 4,384 page-faults 
# 0.160 K/sec ( +- 0.01% ) 64,747,374,606 cycles # 2.370 GHz ( +- 0.24% ) [33.33%] 61,341,580,278 stalled-cycles-frontend # 94.74% frontend cycles idle ( +- 0.26% ) [33.33%] 56,702,237,511 stalled-cycles-backend # 87.57% backend cycles idle ( +- 0.07% ) [33.33%] 10,033,724,846 instructions # 0.15 insns per cycle # 6.11 stalled cycles per insn ( +- 0.09% ) [41.65%] 2,190,424,932 branches # 80.195 M/sec ( +- 0.12% ) [41.66%] 1,028,630 branch-misses # 0.05% of all branches ( +- 1.50% ) [41.66%] 3,302,006,540 L1-dcache-loads # 120.891 M/sec ( +- 0.11% ) [41.68%] 271,374,358 L1-dcache-misses # 8.22% of all L1-dcache hits ( +- 0.04% ) [41.66%] 20,385,476 LLC-load # 0.746 M/sec ( +- 1.64% ) [33.34%] 76,754 LLC-misses # 0.38% of all LL-cache hits ( +- 2.35% ) [33.34%] 3,309,927,290 dTLB-loads # 121.181 M/sec ( +- 0.03% ) [33.34%] 2,098,967,427 dTLB-misses # 63.41% of all dTLB cache hits ( +- 0.03% ) [33.34%] 27.364448741 seconds time elapsed ( +- 0.24% ) === I personally prefer implementation present in this patchset. It doesn't touch arch-specific code. This patch: Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled with zeros. For now let's allocate the page on hugepage_init(). We'll switch to lazy allocation later. We are not going to map the huge zero page until we can handle it properly on all code paths. is_huge_zero_{pfn,pmd}() functions will be used by following patches to check whether the pfn/pmd is huge zero page. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f902e20e8c0..04eb489a6805 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -47,6 +47,7 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static struct task_struct *khugepaged_thread __read_mostly; +static unsigned long huge_zero_pfn __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); @@ -159,6 +160,29 @@ static int start_khugepaged(void) return err; } +static int __init init_huge_zero_page(void) +{ + struct page *hpage; + + hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!hpage) + return -ENOMEM; + + huge_zero_pfn = page_to_pfn(hpage); + return 0; +} + +static inline bool is_huge_zero_pfn(unsigned long pfn) +{ + return pfn == huge_zero_pfn; +} + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_pfn(pmd_pfn(pmd)); +} + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -540,6 +564,10 @@ static int __init hugepage_init(void) if (err) return err; + err = init_huge_zero_page(); + if (err) + goto out; + err = khugepaged_slab_init(); if (err) goto out; @@ -562,6 +590,8 @@ static int __init hugepage_init(void) return 0; out: + if (huge_zero_pfn) + __free_page(pfn_to_page(huge_zero_pfn)); hugepage_exit_sysfs(hugepage_kobj); return err; } -- cgit From 479f0abbfd253d1117a35c1df12755d27a2a0705 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 12 Dec 2012 13:50:50 -0800 Subject: thp: zap_huge_pmd(): zap huge zero pmd We don't have a mapped page to zap in huge zero page case. Let's just clear pmd and remove it from tlb. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 04eb489a6805..1ee34ddb46ad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1085,15 +1085,20 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; pgtable = pgtable_trans_huge_withdraw(tlb->mm); orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); - page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); - tlb_remove_page(tlb, page); + if (is_huge_zero_pmd(orig_pmd)) { + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); + } else { + page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + } pte_free(tlb->mm, pgtable); ret = 1; } -- cgit From fc9fe822f7112db23e51e2be3b886f5d8f0afdb6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:51 -0800 Subject: thp: copy_huge_pmd(): copy huge zero page It's easy to copy huge zero page. Just set destination pmd to huge zero page. It's safe to copy huge zero page since we have none yet :-p [rientjes@google.com: fix comment] Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ee34ddb46ad..650625390f61 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -708,6 +708,18 @@ static inline struct page *alloc_hugepage(int defrag) } #endif +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) +{ + pmd_t entry; + entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot); + entry = pmd_wrprotect(entry); + entry = pmd_mkhuge(entry); + set_pmd_at(mm, haddr, pmd, entry); + pgtable_trans_huge_deposit(mm, pgtable); + mm->nr_ptes++; +} + int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) @@ -785,6 +797,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); goto out_unlock; } + /* + * mm->page_table_lock is enough to be sure that huge zero pmd is not + * under splitting since we don't split the page itself, only pmd to + * a page table. 
+ */ + if (is_huge_zero_pmd(pmd)) { + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd); + ret = 0; + goto out_unlock; + } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ spin_unlock(&src_mm->page_table_lock); -- cgit From 93b4796dede916de74b21fbd637588da6a99a7ec Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:54 -0800 Subject: thp: do_huge_pmd_wp_page(): handle huge zero page On write access to huge zero page we alloc a new huge page and clear it. If ENOMEM, graceful fallback: we create a new pmd table and set pte around fault address to newly allocated normal (4k) page. All other ptes in the pmd set to normal zero page. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++-------- mm/memory.c | 7 ---- 2 files changed, 96 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 650625390f61..a959b3a4ddd5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -858,6 +858,70 @@ unlock: spin_unlock(&mm->page_table_lock); } +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + struct page *page; + int i, ret = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + ret |= VM_FAULT_OOM; + goto out; + } + + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + clear_user_highpage(page, address); + __SetPageUptodate(page); + + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + spin_lock(&mm->page_table_lock); + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + if (haddr == (address & PAGE_MASK)) { + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, haddr); + } else { + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + } + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + spin_unlock(&mm->page_table_lock); + inc_mm_counter(mm, MM_ANONPAGES); + + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + ret |= VM_FAULT_WRITE; +out: + return ret; +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -964,19 +1028,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { int ret = 0; - struct page *page, *new_page; + struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); + haddr = address & HPAGE_PMD_MASK; + if 
(is_huge_zero_pmd(orig_pmd)) + goto alloc; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); VM_BUG_ON(!PageCompound(page) || !PageHead(page)); - haddr = address & HPAGE_PMD_MASK; if (page_mapcount(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); @@ -988,7 +1054,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } get_page(page); spin_unlock(&mm->page_table_lock); - +alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), @@ -998,24 +1064,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!new_page)) { count_vm_event(THP_FAULT_FALLBACK); - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) - split_huge_page(page); - put_page(page); + if (is_huge_zero_pmd(orig_pmd)) { + ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, + address, pmd, haddr); + } else { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); + put_page(page); + } goto out; } count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); - split_huge_page(page); - put_page(page); + if (page) { + split_huge_page(page); + put_page(page); + } ret |= VM_FAULT_OOM; goto out; } - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + if (is_huge_zero_pmd(orig_pmd)) + clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + else + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); mmun_start = haddr; @@ -1023,7 +1099,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); - put_page(page); + if (page) + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); @@ -1031,14 +1108,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mn; } else { pmd_t entry; - VM_BUG_ON(!PageHead(page)); entry = mk_huge_pmd(new_page, vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); - page_remove_rmap(page); - put_page(page); + if (is_huge_zero_pmd(orig_pmd)) + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + else { + VM_BUG_ON(!PageHead(page)); + page_remove_rmap(page); + put_page(page); + } ret |= VM_FAULT_WRITE; } spin_unlock(&mm->page_table_lock); diff --git a/mm/memory.c b/mm/memory.c index 765377385632..259b34fe1347 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -724,13 +724,6 @@ static inline int is_zero_pfn(unsigned long pfn) } #endif -#ifndef my_zero_pfn -static inline unsigned long my_zero_pfn(unsigned long addr) -{ - return zero_pfn; -} -#endif - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * -- cgit From cad7f613c4d010e1d0f05c9a4fb33c7ae40ba115 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:57 -0800 Subject: thp: change_huge_pmd(): make sure we don't try to make a page writable mprotect core never tries to make page writable using change_huge_pmd(). Let's add an assert that the assumption is true. It's important to be sure we will not make huge zero page writable. 
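To make the assumption concrete, here is a minimal userspace sketch (illustrative only, not part of the patch; the MADV_HUGEPAGE call and the 2M alignment are assumptions about how to get a THP-backed mapping that can use the huge zero page). mprotect() on such a region ends up in change_huge_pmd(), which rewrites the protection but never marks the pmd writable; write access is only granted later, in the write-fault (COW) path, so the new BUG_ON(pmd_write(entry)) should never fire:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define SZ_2M	(2UL << 20)

int main(void)
{
	/* over-allocate so a 2M-aligned, 2M-sized region can be carved out */
	char *raw = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *p;

	assert(raw != MAP_FAILED);
	p = (char *)(((uintptr_t)raw + SZ_2M - 1) & ~(SZ_2M - 1));
	madvise(p, SZ_2M, MADV_HUGEPAGE);

	/* read-only fault: with this series the pmd can point to the
	 * huge zero page */
	assert(p[0] == 0);

	/* both calls go through change_huge_pmd(); permissions are
	 * rewritten but the pmd is never made writable there */
	assert(mprotect(p, SZ_2M, PROT_READ) == 0);
	assert(mprotect(p, SZ_2M, PROT_READ | PROT_WRITE) == 0);

	/* an actual store must take the write fault / COW path instead */
	memset(p, 0xaa, SZ_2M);
	return 0;
}
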
Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a959b3a4ddd5..7742fb36eb4d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1273,6 +1273,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_modify(entry, newprot); + BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; -- cgit From e180377f1ae48b3cbc559c9875d9b038f7f000c6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:59 -0800 Subject: thp: change split_huge_page_pmd() interface Pass vma instead of mm and add address parameter. In most cases we already have vma on the stack. We provides split_huge_page_pmd_mm() for few cases when we have mm, but not vma. This change is preparation to huge zero pmd splitting implementation. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++++++++++++++++-- mm/memory.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/pagewalk.c | 2 +- 6 files changed, 23 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7742fb36eb4d..de6aa5f3fdd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2475,9 +2475,14 @@ static int khugepaged(void *none) return 0; } -void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd) { struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + struct mm_struct *mm = vma->vm_mm; + + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { @@ -2495,6 +2500,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) BUG_ON(pmd_trans_huge(*pmd)); } +void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, + pmd_t *pmd) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, address); + BUG_ON(vma == NULL); + split_huge_page_pmd(vma, address, pmd); +} + static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { @@ -2509,7 +2524,7 @@ static void split_huge_page_address(struct mm_struct *mm, * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. 
*/ - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, address, pmd); } void __vma_adjust_trans_huge(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 259b34fe1347..f9a4b0cb8623 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1243,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, BUG(); } #endif - split_huge_page_pmd(vma->vm_mm, pmd); + split_huge_page_pmd(vma, addr, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1512,7 +1512,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, } if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } spin_lock(&mm->page_table_lock); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 05b28361a39b..0719e8dd4945 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..e8c3938db6fa 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma->vm_mm, pmd); + split_huge_page_pmd(vma, addr, pmd); else if (change_huge_pmd(vma, pmd, addr, newprot)) continue; /* fall through */ diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..eabb24da6c9e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, need_flush = true; continue; } else if (!err) { - split_huge_page_pmd(vma->vm_mm, old_pmd); + split_huge_page_pmd(vma, old_addr, old_pmd); } VM_BUG_ON(pmd_trans_huge(*old_pmd)); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6c118d012bb5..35aa294656cd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd(walk->mm, pmd); + split_huge_page_pmd_mm(walk->mm, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); -- cgit From c5a647d09fe9fc3e0241c89845cf8e6220b916f5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:00 -0800 Subject: thp: implement splitting pmd for huge zero page We can't split huge zero page itself (and it's bug if we try), but we can split the pmd which points to it. On splitting the pmd we create a table with all ptes set to normal zero page. [akpm@linux-foundation.org: fix build error] Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. 
Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de6aa5f3fdd2..ea0e23fd6967 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1616,6 +1616,7 @@ int split_huge_page(struct page *page) struct anon_vma *anon_vma; int ret = 1; + BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); anon_vma = page_lock_anon_vma(page); if (!anon_vma) @@ -2475,24 +2476,64 @@ static int khugepaged(void *none) return 0; } +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd; + int i; + + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); +} + void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { struct page *page; - unsigned long haddr = address & HPAGE_PMD_MASK; struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + if (is_huge_zero_pmd(*pmd)) { + __split_huge_zero_page_pmd(vma, haddr, pmd); + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); -- cgit From 80371957f09814d25c38733d2d08de47f59a13c2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:02 -0800 Subject: thp: setup huge zero page on non-write page fault All code paths seems covered. Now we can map huge zero page on read page fault. We setup it in do_huge_pmd_anonymous_page() if area around fault address is suitable for THP and we've got read page fault. If we fail to setup huge zero page (ENOMEM) we fallback to handle_pte_fault() as we normally do in THP. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. 
Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ea0e23fd6967..e1b6f4e13b91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -733,6 +733,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE)) { + pgtable_t pgtable; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) + return VM_FAULT_OOM; + spin_lock(&mm->page_table_lock); + set_huge_zero_page(pgtable, mm, vma, haddr, pmd); + spin_unlock(&mm->page_table_lock); + return 0; + } page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), vma, haddr, numa_node_id(), 0); if (unlikely(!page)) { -- cgit From 78ca0e679203bbf74f8febd9725a1c8dd083d073 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:05 -0800 Subject: thp: lazy huge zero page allocation Instead of allocating huge zero page on hugepage_init() we can postpone it until first huge zero page map. It saves memory if THP is not in use. cmpxchg() is used to avoid race on huge_zero_pfn initialization. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e1b6f4e13b91..9539d6654bb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -160,22 +160,24 @@ static int start_khugepaged(void) return err; } -static int __init init_huge_zero_page(void) +static int init_huge_zero_pfn(void) { struct page *hpage; + unsigned long pfn; hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!hpage) return -ENOMEM; - - huge_zero_pfn = page_to_pfn(hpage); + pfn = page_to_pfn(hpage); + if (cmpxchg(&huge_zero_pfn, 0, pfn)) + __free_page(hpage); return 0; } static inline bool is_huge_zero_pfn(unsigned long pfn) { - return pfn == huge_zero_pfn; + return huge_zero_pfn && pfn == huge_zero_pfn; } static inline bool is_huge_zero_pmd(pmd_t pmd) @@ -564,10 +566,6 @@ static int __init hugepage_init(void) if (err) return err; - err = init_huge_zero_page(); - if (err) - goto out; - err = khugepaged_slab_init(); if (err) goto out; @@ -590,8 +588,6 @@ static int __init hugepage_init(void) return 0; out: - if (huge_zero_pfn) - __free_page(pfn_to_page(huge_zero_pfn)); hugepage_exit_sysfs(hugepage_kobj); return err; } @@ -735,6 +731,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE)) { pgtable_t pgtable; + if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) { + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; -- cgit From 97ae17497e996ff09bf97b6db3b33f7fd4029092 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:06 -0800 Subject: thp: implement refcounting for huge zero page H. Peter Anvin doesn't like huge zero page which sticks in memory forever after the first allocation. Here's implementation of lockless refcounting for huge zero page. We have two basic primitives: {get,put}_huge_zero_page(). 
They manipulate reference counter. If counter is 0, get_huge_zero_page() allocates a new huge page and takes two references: one for caller and one for shrinker. We free the page only in shrinker callback if counter is 1 (only shrinker has the reference). put_huge_zero_page() only decrements counter. Counter is never zero in put_huge_zero_page() since shrinker holds on reference. Freeing huge zero page in shrinker callback helps to avoid frequent allocate-free. Refcounting has cost. On 4 socket machine I observe ~1% slowdown on parallel (40 processes) read page faulting comparing to lazy huge page allocation. I think it's pretty reasonable for synthetic benchmark. [lliubbo@gmail.com: fix mismerge] Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 113 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 88 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9539d6654bb9..d89220cb1d9f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include + #include #include #include "internal.h" @@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static struct task_struct *khugepaged_thread __read_mostly; -static unsigned long huge_zero_pfn __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); @@ -160,31 +161,74 @@ static int start_khugepaged(void) return err; } -static int init_huge_zero_pfn(void) +static atomic_t huge_zero_refcount; +static unsigned long huge_zero_pfn __read_mostly; + +static inline bool is_huge_zero_pfn(unsigned long pfn) { - struct page *hpage; - unsigned long pfn; + unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); + return zero_pfn && pfn == zero_pfn; +} - hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_pfn(pmd_pfn(pmd)); +} + +static unsigned long get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return ACCESS_ONCE(huge_zero_pfn); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); - if (!hpage) - return -ENOMEM; - pfn = page_to_pfn(hpage); - if (cmpxchg(&huge_zero_pfn, 0, pfn)) - __free_page(hpage); - return 0; + if (!zero_page) + return 0; + preempt_disable(); + if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { + preempt_enable(); + __free_page(zero_page); + goto retry; + } + + /* We take additional reference here. It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return ACCESS_ONCE(huge_zero_pfn); } -static inline bool is_huge_zero_pfn(unsigned long pfn) +static void put_huge_zero_page(void) { - return huge_zero_pfn && pfn == huge_zero_pfn; + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. 
+ */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } -static inline bool is_huge_zero_pmd(pmd_t pmd) +static int shrink_huge_zero_page(struct shrinker *shrink, + struct shrink_control *sc) { - return is_huge_zero_pfn(pmd_pfn(pmd)); + if (!sc->nr_to_scan) + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; + + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); + BUG_ON(zero_pfn == 0); + __free_page(__pfn_to_page(zero_pfn)); + } + + return 0; } +static struct shrinker huge_zero_page_shrinker = { + .shrink = shrink_huge_zero_page, + .seeks = DEFAULT_SEEKS, +}; + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -576,6 +620,8 @@ static int __init hugepage_init(void) goto out; } + register_shrinker(&huge_zero_page_shrinker); + /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead @@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag) #endif static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + unsigned long zero_pfn) { pmd_t entry; - entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot); + entry = pfn_pmd(zero_pfn, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); @@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE)) { pgtable_t pgtable; - if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) { - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } + unsigned long zero_pfn; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; + zero_pfn = get_huge_zero_page(); + if (unlikely(!zero_pfn)) { + pte_free(mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } spin_lock(&mm->page_table_lock); - set_huge_zero_page(pgtable, mm, vma, haddr, pmd); + set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_pfn); spin_unlock(&mm->page_table_lock); return 0; } @@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * a page table. */ if (is_huge_zero_pmd(pmd)) { - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd); + unsigned long zero_pfn; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. 
+ */ + zero_pfn = get_huge_zero_page(); + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_pfn); ret = 0; goto out_unlock; } @@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); + put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1123,9 +1183,10 @@ alloc: page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); - if (is_huge_zero_pmd(orig_pmd)) + if (is_huge_zero_pmd(orig_pmd)) { add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - else { + put_huge_zero_page(); + } else { VM_BUG_ON(!PageHead(page)); page_remove_rmap(page); put_page(page); @@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); + put_huge_zero_page(); } else { page = pmd_page(orig_pmd); page_remove_rmap(page); @@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); } void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, -- cgit From d8a8e1f0da3d29d7268b3300c96a059d63901b76 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:09 -0800 Subject: thp, vmstat: implement HZP_ALLOC and HZP_ALLOC_FAILED events hzp_alloc is incremented every time a huge zero page is successfully allocated. It includes allocations which where dropped due race with other allocation. Note, it doesn't count every map of the huge zero page, only its allocation. hzp_alloc_failed is incremented if kernel fails to allocate huge zero page and falls back to using small pages. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++++- mm/vmstat.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d89220cb1d9f..9a5d45dfad44 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -184,8 +184,11 @@ retry: zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); - if (!zero_page) + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return 0; + } + count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { preempt_enable(); diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111b..5da4b19023c3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -801,6 +801,8 @@ const char * const vmstat_text[] = { "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_split", + "thp_zero_page_alloc", + "thp_zero_page_alloc_failed", #endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ -- cgit From 79da5407eeadc740fbf4b45d6df7d7f8e6adaf2c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:51:12 -0800 Subject: thp: introduce sysfs knob to disable huge zero page By default kernel tries to use huge zero page on read page fault. 
It's possible to disable huge zero page by writing 0 or enable it back by writing 1: echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9a5d45dfad44..72835cea0b0f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -39,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1< Date: Wed, 12 Dec 2012 13:51:14 -0800 Subject: thp: avoid race on multiple parallel page faults to the same page pmd value is stable only with mm->page_table_lock taken. After taking the lock we need to check that nobody modified the pmd before changing it. Signed-off-by: Kirill A. Shutemov Cc: Jiri Slaby Cc: David Rientjes Reviewed-by: Bob Liu Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 72835cea0b0f..827d9c813051 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -769,17 +769,20 @@ static inline struct page *alloc_hugepage(int defrag) } #endif -static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, unsigned long zero_pfn) { pmd_t entry; + if (!pmd_none(*pmd)) + return false; entry = pfn_pmd(zero_pfn, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pgtable); mm->nr_ptes++; + return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -799,6 +802,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, transparent_hugepage_use_zero_page()) { pgtable_t pgtable; unsigned long zero_pfn; + bool set; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -809,9 +813,13 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } spin_lock(&mm->page_table_lock); - set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_pfn); spin_unlock(&mm->page_table_lock); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); + } return 0; } page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), @@ -885,14 +893,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { unsigned long zero_pfn; + bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. 
*/ zero_pfn = get_huge_zero_page(); - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_pfn); + BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -949,7 +959,7 @@ unlock: static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned long haddr) + pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { pgtable_t pgtable; pmd_t _pmd; @@ -978,6 +988,9 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_page; + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ @@ -1010,6 +1023,12 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, ret |= VM_FAULT_WRITE; out: return ret; +out_free_page: + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mem_cgroup_uncharge_page(page); + put_page(page); + goto out; } static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, @@ -1156,7 +1175,7 @@ alloc: count_vm_event(THP_FAULT_FALLBACK); if (is_huge_zero_pmd(orig_pmd)) { ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, - address, pmd, haddr); + address, pmd, orig_pmd, haddr); } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); -- cgit From c8bf2d8ba4fbc093de7c0d192fe5d2531f14b8b9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 12 Dec 2012 13:51:17 -0800 Subject: mm: compaction: Fix compiler warning compact_capture_page() is only used if compaction is enabled so it should be moved into the corresponding #ifdef. Signed-off-by: Thierry Reding Acked-by: Mel Gorman Cc: Rik van Riel Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 108 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 54 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d24dd2d7bad4..129791218226 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -215,60 +215,6 @@ static bool suitable_migration_target(struct page *page) return false; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. 
- */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -953,6 +899,60 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } +static void compact_capture_page(struct compact_control *cc) +{ + unsigned long flags; + int mtype, mtype_low, mtype_high; + + if (!cc->page || *cc->page) + return; + + /* + * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP + * regardless of the migratetype of the freelist is is captured from. + * This is fine because the order for a high-order MIGRATE_MOVABLE + * allocation is typically at least a pageblock size and overall + * fragmentation is not impaired. Other allocation types must + * capture pages from their own migratelist because otherwise they + * could pollute other pageblocks like MIGRATE_MOVABLE with + * difficult to move pages and making fragmentation worse overall. + */ + if (cc->migratetype == MIGRATE_MOVABLE) { + mtype_low = 0; + mtype_high = MIGRATE_PCPTYPES; + } else { + mtype_low = cc->migratetype; + mtype_high = cc->migratetype + 1; + } + + /* Speculatively examine the free lists without zone lock */ + for (mtype = mtype_low; mtype < mtype_high; mtype++) { + int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct page *page; + struct free_area *area; + area = &(cc->zone->free_area[order]); + if (list_empty(&area->free_list[mtype])) + continue; + + /* Take the lock and attempt capture of the page */ + if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) + return; + if (!list_empty(&area->free_list[mtype])) { + page = list_entry(area->free_list[mtype].next, + struct page, lru); + if (capture_free_page(page, cc->order, mtype)) { + spin_unlock_irqrestore(&cc->zone->lock, + flags); + *cc->page = page; + return; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + } +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; -- cgit From be49a6e13537f32f85bd8c4ba04317ca36ca60ff Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 12 Dec 2012 13:51:19 -0800 Subject: mm: use migrate_prep() instead of migrate_prep_local() __alloc_contig_migrate_range() should use all possible ways to get all the pages migrated from the given memory range, so pruning per-cpu lru lists for all CPUs is required, regadless the cost of such operation. Otherwise some pages which got stuck at per-cpu lru list might get missed by migration procedure causing the contiguous allocation to fail. 
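For readers without mm/migrate.c in front of them, the practical difference between the two helpers is roughly the following; this is a from-memory sketch of what they do in kernels of this era, not a verbatim quote, so treat the exact bodies as an assumption:

int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.  This schedules
	 * lru_add_drain() work on every online CPU and waits for it, so
	 * pages parked on other CPUs' per-cpu pagevecs reach the LRU and
	 * become visible to the migration scanner.
	 */
	lru_add_drain_all();

	return 0;
}

int migrate_prep_local(void)
{
	/* drains only the calling CPU's pagevecs */
	lru_add_drain();

	return 0;
}

Since __alloc_contig_migrate_range() must migrate every page in the requested range, paying for the all-CPU drain is the safer choice even though it is more expensive.
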
Reported-by: SeongHwan Yoon Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a8d339d282a..4171cd4f8257 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5727,7 +5727,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned int tries = 0; int ret = 0; - migrate_prep_local(); + migrate_prep(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { -- cgit From 31aaea4aa1b267de52a44a72fd75990c79a9a433 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:27 -0800 Subject: memcontrol: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +++++++++--------- mm/page_cgroup.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 12307b3838fb..49d86d06e1dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -800,7 +800,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, int nid; u64 total = 0; - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } @@ -1644,9 +1644,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) return; /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_MEMORY]; - for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + for_each_node_mask(nid, node_states[N_MEMORY]) { if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) node_clear(nid, memcg->scan_nodes); @@ -1717,7 +1717,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) /* * Check rest of nodes. 
*/ - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { if (node_isset(nid, memcg->scan_nodes)) continue; if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) @@ -3776,7 +3776,7 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) lru_add_drain_all(); drain_all_stock_sync(memcg); mem_cgroup_start_move(memcg); - for_each_node_state(node, N_HIGH_MEMORY) { + for_each_node_state(node, N_MEMORY) { for (zid = 0; zid < MAX_NR_ZONES; zid++) { enum lru_list lru; for_each_lru(lru) { @@ -4122,7 +4122,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); seq_printf(m, " N%d=%lu", nid, node_nr); } @@ -4130,7 +4130,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE); seq_printf(m, " N%d=%lu", nid, node_nr); @@ -4139,7 +4139,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON); seq_printf(m, " N%d=%lu", nid, node_nr); @@ -4148,7 +4148,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, BIT(LRU_UNEVICTABLE)); seq_printf(m, " N%d=%lu", nid, node_nr); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 44db00e253ed..6d757e3a872a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -274,7 +274,7 @@ void __init page_cgroup_init(void) if (mem_cgroup_disabled()) return; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); -- cgit From bd3a66c1cdf31274489cc1b5ace879695a5a1797 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:28 -0800 Subject: oom: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 18f1ae2b45de..fe36205a7f8a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -215,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * the page allocator means a mempolicy is in effect. Cpuset policy * is enforced in get_page_from_freelist(). 
*/ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { *totalpages = total_swap_pages; for_each_node_mask(nid, *nodemask) *totalpages += node_spanned_pages(nid); -- cgit From 389162c22dc373932089ce16cbab43e80a88b035 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:30 -0800 Subject: mm,migrate: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Christoph Lameter Signed-off-by: Wen Congyang Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3f675ca08279..cae02711181d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1238,7 +1238,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, if (node < 0 || node >= MAX_NUMNODES) goto out_pm; - if (!node_state(node, N_HIGH_MEMORY)) + if (!node_state(node, N_MEMORY)) goto out_pm; err = -EACCES; -- cgit From 01f13bd607346afa8911ac180588244698676e5c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:33 -0800 Subject: mempolicy: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0719e8dd4945..aaf54566cb6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -212,9 +212,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; - /* Check N_HIGH_MEMORY */ + /* Check N_MEMORY */ nodes_and(nsc->mask1, - cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); + cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) @@ -1388,7 +1388,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { + if (!nodes_subset(*new, node_states[N_MEMORY])) { err = -EINVAL; goto out_put; } @@ -2326,7 +2326,7 @@ void __init numa_policy_init(void) * fall back to the largest node if they're all smaller. 
*/ nodes_clear(interleave_nodes); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ @@ -2407,7 +2407,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; - if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) + if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); @@ -2441,7 +2441,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) * Default to online nodes with memory if no nodelist */ if (!nodelist) - nodes = node_states[N_HIGH_MEMORY]; + nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* -- cgit From 8cebfcd074a3044780f3f9af236fc8534d89e55e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:36 -0800 Subject: hugetlb: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1ef2cd4ae3c9..bd22bd895299 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h, * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; } } @@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + int nr_nodes = nodes_weight(node_states[N_MEMORY]); while (nr_nodes) { void *addr; addr = __alloc_bootmem_node_nopanic( NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_HIGH_MEMORY])), + &node_states[N_MEMORY])), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, - &node_states[N_HIGH_MEMORY])) + &node_states[N_MEMORY])) break; } h->max_huge_pages = i; @@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; } } else if (nodes_allowed) { /* @@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; init_nodemask_of_node(nodes_allowed, nid); } else - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); return len; @@ -1844,7 +1844,7 @@ static void hugetlb_register_all_nodes(void) { int nid; - 
for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { struct node *node = node_devices[nid]; if (node->dev.id == nid) hugetlb_register_node(node); @@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); + h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); /* @@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; } h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); } out: -- cgit From a47b53c5f9a690addbc094501e4ae083ec307f57 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:37 -0800 Subject: vmstat: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that have normal or high memory. N_MEMORY stands for the nodes that have any memory. The code here needs to handle the nodes which have memory, so we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Christoph Lameter Signed-off-by: Wen Congyang Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 5da4b19023c3..9a4a522c0b0f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -932,7 +932,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) pg_data_t *pgdat = (pg_data_t *)arg; /* check memoryless node */ - if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + if (!node_state(pgdat->node_id, N_MEMORY)) return 0; seq_printf(m, "Page block order: %d\n", pageblock_order); @@ -1294,7 +1294,7 @@ static int unusable_show(struct seq_file *m, void *arg) pg_data_t *pgdat = (pg_data_t *)arg; /* check memoryless node */ - if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + if (!node_state(pgdat->node_id, N_MEMORY)) return 0; walk_zones_in_node(m, pgdat, unusable_show_print); -- cgit From 48fb2e240c4275c6ba4f53c9397f5fd6f350c3a7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:43 -0800 Subject: vmscan: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that have normal or high memory. N_MEMORY stands for the nodes that have any memory. The code here needs to handle the nodes which have memory, so we should use N_MEMORY instead.
Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 157bb116dec8..7f3096137b8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3131,7 +3131,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); const struct cpumask *mask; @@ -3187,7 +3187,7 @@ static int __init kswapd_init(void) int nid; swap_setup(); - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; -- cgit From 4b0ef1fe8a626f0ba7f649764f979d0dc9eab86b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:46 -0800 Subject: page_alloc: use N_MEMORY instead N_HIGH_MEMORY change the node_states initialization N_HIGH_MEMORY stands for the nodes that have normal or high memory. N_MEMORY stands for the nodes that have any memory. The code here needs to handle the nodes which have memory, so we should use N_MEMORY instead. Since we introduced N_MEMORY, we update the initialization of node_states accordingly. Signed-off-by: Lai Jiangshan Signed-off-by: Lin Feng Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4171cd4f8257..35727168896b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1695,7 +1695,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * * If the zonelist cache is present in the passed in zonelist, then * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) + * tasks mems_allowed, or node_states[N_MEMORY].) * * If the zonelist cache is not available for this zonelist, does * nothing and returns NULL. @@ -1724,7 +1724,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? &cpuset_current_mems_allowed : - &node_states[N_HIGH_MEMORY]; + &node_states[N_MEMORY]; return allowednodes; } @@ -3238,7 +3238,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) return node; } - for_each_node_state(n, N_HIGH_MEMORY) { + for_each_node_state(n, N_MEMORY) { /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) @@ -3380,7 +3380,7 @@ static int default_zonelist_order(void) * local memory, NODE_ORDER may be suitable. */ average_size = total_size / - (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); + (nodes_weight(node_states[N_MEMORY]) + 1); for_each_online_node(nid) { low_kmem_size = 0; total_size = 0; @@ -4731,7 +4731,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. - * Populate N_HIGH_MEMORY for calculating usable_nodes. + * Populate N_MEMORY for calculating usable_nodes.
*/ static unsigned long __init early_calculate_totalpages(void) { @@ -4744,7 +4744,7 @@ static unsigned long __init early_calculate_totalpages(void) totalpages += pages; if (pages) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(nid, N_MEMORY); } return totalpages; } @@ -4761,9 +4761,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */ - nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; + nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); - int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + int usable_nodes = nodes_weight(node_states[N_MEMORY]); /* * If movablecore was specified, calculate what size of @@ -4798,7 +4798,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; /* @@ -4890,23 +4890,27 @@ restart: out: /* restore the node_state */ - node_states[N_HIGH_MEMORY] = saved_node_state; + node_states[N_MEMORY] = saved_node_state; } -/* Any regular memory on that node ? */ -static void __init check_for_regular_memory(pg_data_t *pgdat) +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid) { -#ifdef CONFIG_HIGHMEM enum zone_type zone_type; - for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + if (N_MEMORY == N_NORMAL_MEMORY) + return; + + for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (zone->present_pages) { - node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + node_set_state(nid, N_HIGH_MEMORY); + if (N_NORMAL_MEMORY != N_HIGH_MEMORY && + zone_type <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); break; } } -#endif } /** @@ -4989,8 +4993,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Any memory on that node */ if (pgdat->node_present_pages) - node_set_state(nid, N_HIGH_MEMORY); - check_for_regular_memory(pgdat); + node_set_state(nid, N_MEMORY); + check_for_memory(pgdat, nid); } } -- cgit From 6715ddf94576ff39f5d1cda8d4c568d3b79e82ec Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:49 -0800 Subject: hotplug: update nodemasks management Update nodemasks management for N_MEMORY. [lliubbo@gmail.com: fix build] Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 87 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index de9cb14ae753..eca4aac1a83b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -595,13 +595,15 @@ static void node_states_check_changes_online(unsigned long nr_pages, enum zone_type zone_last = ZONE_NORMAL; /* - * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes - * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. 
* - * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes - * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. */ - if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + if (N_MEMORY == N_NORMAL_MEMORY) zone_last = ZONE_MOVABLE; /* @@ -615,12 +617,34 @@ static void node_states_check_changes_online(unsigned long nr_pages, else arg->status_change_nid_normal = -1; +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM. + * + * If we don't have movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_MOVABLE, + * set zone_last to ZONE_MOVABLE. + */ + zone_last = ZONE_HIGHMEM; + if (N_MEMORY == N_HIGH_MEMORY) + zone_last = ZONE_MOVABLE; + + if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid_high = nid; + else + arg->status_change_nid_high = -1; +#else + arg->status_change_nid_high = arg->status_change_nid_normal; +#endif + /* * if the node don't have memory befor online, we will need to - * set the node to node_states[N_HIGH_MEMORY] after the memory + * set the node to node_states[N_MEMORY] after the memory * is online. */ - if (!node_state(nid, N_HIGH_MEMORY)) + if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; else arg->status_change_nid = -1; @@ -631,7 +655,10 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_set_state(node, N_NORMAL_MEMORY); - node_set_state(node, N_HIGH_MEMORY); + if (arg->status_change_nid_high >= 0) + node_set_state(node, N_HIGH_MEMORY); + + node_set_state(node, N_MEMORY); } @@ -1099,13 +1126,15 @@ static void node_states_check_changes_offline(unsigned long nr_pages, enum zone_type zt, zone_last = ZONE_NORMAL; /* - * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes - * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. * - * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes - * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. */ - if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + if (N_MEMORY == N_NORMAL_MEMORY) zone_last = ZONE_MOVABLE; /* @@ -1122,6 +1151,30 @@ static void node_states_check_changes_offline(unsigned long nr_pages, else arg->status_change_nid_normal = -1; +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM. + * + * If we don't have movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_MOVABLE, + * set zone_last to ZONE_MOVABLE. 
+ */ + zone_last = ZONE_HIGHMEM; + if (N_MEMORY == N_HIGH_MEMORY) + zone_last = ZONE_MOVABLE; + + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_high = zone_to_nid(zone); + else + arg->status_change_nid_high = -1; +#else + arg->status_change_nid_high = arg->status_change_nid_normal; +#endif + /* * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE */ @@ -1146,9 +1199,13 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid >= 0)) + if ((N_MEMORY != N_NORMAL_MEMORY) && + (arg->status_change_nid_high >= 0)) node_clear_state(node, N_HIGH_MEMORY); + + if ((N_MEMORY != N_HIGH_MEMORY) && + (arg->status_change_nid >= 0)) + node_clear_state(node, N_MEMORY); } static int __ref __offline_pages(unsigned long start_pfn, -- cgit From 2897b4d29d9fca82a57b09b8a216a5d604966e4b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 12 Dec 2012 13:51:53 -0800 Subject: mm: WARN_ON_ONCE if f_op->mmap() change vma's start address During reviewing the source code, I found a comment which mention that after f_op->mmap(), vma's start address can be changed. I didn't verify that it is really possible, because there are so many f_op->mmap() implementation. But if there are some mmap() which change vma's start address, it is possible error situation, because we already prepare prev vma, rb_link and rb_parent and these are related to original address. So add WARN_ON_ONCE for finding that this situtation really happens. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index f940062c8d4b..9ed3a06242a0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1488,7 +1488,11 @@ munmap_back: * * Answer: Yes, several device drivers can do it in their * f_op->mmap method. -DaveM + * Bug: If addr is changed, prev, rb_link, rb_parent should + * be updated for vma_link() */ + WARN_ON_ONCE(addr != vma->vm_start); + addr = vma->vm_start; pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; -- cgit From 68ae564bbac8eb9ed54ddd2529b0e29ee190b355 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Dec 2012 13:51:57 -0800 Subject: mm, memcg: avoid unnecessary function call when memcg is disabled While profiling numa/core v16 with cgroup_disable=memory on the command line, I noticed mem_cgroup_count_vm_event() still showed up as high as 0.60% in perftop. This occurs because the function is called extremely often even when memcg is disabled. To fix this, inline the check for mem_cgroup_disabled() so we avoid the unnecessary function call if memcg is disabled. 
Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 49d86d06e1dd..7f0e3571df7e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -59,6 +59,8 @@ #include struct cgroup_subsys mem_cgroup_subsys __read_mostly; +EXPORT_SYMBOL(mem_cgroup_subsys); + #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -1015,7 +1017,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -1040,7 +1042,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) out: rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_count_vm_event); +EXPORT_SYMBOL(__mem_cgroup_count_vm_event); /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg -- cgit From 20b2f52b73febce476fc9376f0296c1aa0e4f5a7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:52:00 -0800 Subject: numa: add CONFIG_MOVABLE_NODE for movable-dedicated node We need a node which only contains movable memory. This feature is very important for node hotplug. If a node has normal/highmem, the memory may be used by the kernel and can't be offlined. If the node only contains movable memory, we can offline the memory and the node. All are prepared, we can actually introduce N_MEMORY. add CONFIG_MOVABLE_NODE make we can use it for movable-dedicated node [akpm@linux-foundation.org: fix Kconfig text] Signed-off-by: Lai Jiangshan Tested-by: Yasuaki Ishimatsu Signed-off-by: Wen Congyang Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 8 ++++++++ mm/page_alloc.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index e6651c5de14f..1680a0123a13 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -143,6 +143,14 @@ config NO_BOOTMEM config MEMORY_ISOLATION boolean +config MOVABLE_NODE + boolean "Enable to assign a node which has only movable memory" + depends on HAVE_MEMBLOCK + depends on NO_BOOTMEM + depends on X86_64 + depends on NUMA + default y + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35727168896b..2bf0d43d646b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -89,6 +89,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, +#endif +#ifdef CONFIG_MOVABLE_NODE + [N_MEMORY] = { { [0] = 1UL } }, #endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ -- cgit From 09285af75d1682d8642607941ca6034ea1b159eb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:52:04 -0800 Subject: memory_hotplug: allow online/offline memory to result movable node Now, memory management can handle movable node or nodes which don't have any normal memory, so we can dynamic configure and add movable node by: online a 
ZONE_MOVABLE memory from a previous offline node offline the last normal memory which result a non-normal-memory-node movable-node is very important for power-saving, hardware partitioning and high-available-system(hardware fault management). Signed-off-by: Lai Jiangshan Tested-by: Yasuaki Ishimatsu Signed-off-by: Wen Congyang Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index eca4aac1a83b..c6cd8b515424 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -581,11 +581,19 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +#ifdef CONFIG_MOVABLE_NODE +/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return true; +} +#else /* #ifdef CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } +#endif /* #ifdef CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1093,6 +1101,13 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +#ifdef CONFIG_MOVABLE_NODE +/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + return true; +} +#else /* #ifdef CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1116,6 +1131,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } +#endif /* #ifdef CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, -- cgit From efacd02e4f57d94e934ba5c84f10f8ce91158770 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Dec 2012 13:52:06 -0800 Subject: mm, oom: cleanup pagefault oom handler To lock the entire system from parallel oom killing, it's possible to pass in a zonelist with all zones rather than using for_each_populated_zone() for the iteration. This obsoletes try_set_system_oom() and clear_system_oom() so that they can be removed. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 49 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fe36205a7f8a..0e30ff7b21c5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -591,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) spin_unlock(&zone_scan_lock); } -/* - * Try to acquire the oom killer lock for all system zones. Returns zero if a - * parallel oom killing is taking place, otherwise locks all zones and returns - * non-zero. 
- */ -static int try_set_system_oom(void) -{ - struct zone *zone; - int ret = 1; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - if (zone_is_oom_locked(zone)) { - ret = 0; - goto out; - } - for_each_populated_zone(zone) - zone_set_flag(zone, ZONE_OOM_LOCKED); -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation - * attempts or page faults may now recall the oom killer, if necessary. - */ -static void clear_system_oom(void) -{ - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - zone_clear_flag(zone, ZONE_OOM_LOCKED); - spin_unlock(&zone_scan_lock); -} - /** * out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer @@ -708,15 +671,17 @@ out: /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel - * oom killing is already in progress so do nothing. If a task is found with - * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a + * parallel oom killing is already in progress so do nothing. */ void pagefault_out_of_memory(void) { - if (try_set_system_oom()) { + struct zonelist *zonelist = node_zonelist(first_online_node, + GFP_KERNEL); + + if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_system_oom(); + clear_zonelist_oom(zonelist, GFP_KERNEL); } schedule_timeout_killable(1); } -- cgit From 0fa84a4bfa2aac8c04d45351b40765d61e1fd20d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Dec 2012 13:52:07 -0800 Subject: mm, oom: remove redundant sleep in pagefault oom handler out_of_memory() will already cause current to schedule if it has not been killed, so doing it again in pagefault_out_of_memory() is redundant. Remove it. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0e30ff7b21c5..0399f146ae49 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -683,5 +683,4 @@ void pagefault_out_of_memory(void) out_of_memory(NULL, 0, 0, NULL, false); clear_zonelist_oom(zonelist, GFP_KERNEL); } - schedule_timeout_killable(1); } -- cgit From 9feedc9d831e18ae6d0d15aa562e5e46ba53647b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 12 Dec 2012 13:52:12 -0800 Subject: mm: introduce new field "managed_pages" to struct zone Currently a zone's present_pages is calcuated as below, which is inaccurate and may cause trouble to memory hotplug. spanned_pages - absent_pages - memmap_pages - dma_reserve. During fixing bugs caused by inaccurate zone->present_pages, we found zone->present_pages has been abused. The field zone->present_pages may have different meanings in different contexts: 1) pages existing in a zone. 2) pages managed by the buddy system. For more discussions about the issue, please refer to: http://lkml.org/lkml/2012/11/5/866 https://patchwork.kernel.org/patch/1346751/ This patchset tries to introduce a new field named "managed_pages" to struct zone, which counts "pages managed by the buddy system". And revert zone->present_pages to count "physical pages existing in a zone", which also keep in consistence with pgdat->node_present_pages. 
We will set an initial value for zone->managed_pages in function free_area_init_core() and will adjust it later if the initial value is inaccurate. For DMA/normal zones, the initial value is set to: (spanned_pages - absent_pages - memmap_pages - dma_reserve) Later zone->managed_pages will be adjusted to the accurate value when the bootmem allocator frees all free pages to the buddy system in function free_all_bootmem_node() and free_all_bootmem(). The bootmem allocator doesn't touch highmem pages, so highmem zones' managed_pages is set to the accurate value "spanned_pages - absent_pages" in function free_area_init_core() and won't be updated anymore. This patch also adds a new field "managed_pages" to /proc/zoneinfo and sysrq showmem. [akpm@linux-foundation.org: small comment tweaks] Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: David Rientjes Cc: Maciej Rutecki Tested-by: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Jianguo Wu Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 21 +++++++++++++++++++++ mm/memory_hotplug.c | 10 ++++++++++ mm/nobootmem.c | 22 ++++++++++++++++++++++ mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++-------------- mm/vmstat.c | 6 ++++-- 5 files changed, 87 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 26d057a8b552..19262ac05dd2 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + /* + * In free_area_init_core(), highmem zone's managed_pages is set to + * present_pages, and bootmem allocator doesn't allocate from highmem + * zones. So there's no need to recalculate managed_pages because all + * highmem pages will be managed by the buddy system. Here highmem + * zone also includes highmem movable zone. + */ + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + if (!is_highmem(z)) + z->managed_pages = 0; +} + /** * free_all_bootmem_node - release a node's free pages to the buddy allocator * @pgdat: node to be released @@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); + reset_node_lowmem_managed_pages(pgdat); return free_all_bootmem_core(pgdat->bdata); } @@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; bootmem_data_t *bdata; + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_lowmem_managed_pages(pgdat); list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c6cd8b515424..b7c93ca896d6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + static DEFINE_MUTEX(ppb_lock); type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); + + /* + * Please refer to comment for __free_pages_bootmem() + * for why we serialize here. 
+ */ + mutex_lock(&ppb_lock); __free_pages_bootmem(page, 0); + mutex_unlock(&ppb_lock); } } @@ -748,6 +756,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ return ret; } + zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (onlined_pages) { @@ -1321,6 +1330,7 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ + zone->managed_pages -= offlined_pages; zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..b8294fc03df8 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid) return count; } +static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + /* + * In free_area_init_core(), highmem zone's managed_pages is set to + * present_pages, and bootmem allocator doesn't allocate from highmem + * zones. So there's no need to recalculate managed_pages because all + * highmem pages will be managed by the buddy system. Here highmem + * zone also includes highmem movable zone. + */ + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + if (!is_highmem(z)) + z->managed_pages = 0; +} + /** * free_all_bootmem_node - release a node's free pages to the buddy allocator * @pgdat: node to be released @@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); + reset_node_lowmem_managed_pages(pgdat); /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ return 0; @@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_lowmem_managed_pages(pgdat); + /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bf0d43d646b..0b6a6d04300a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -735,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } +/* + * Read access to zone->managed_pages is safe because it's unsigned long, + * but we still need to serialize writers. Currently all callers of + * __free_pages_bootmem() except put_page_bootmem() should only be used + * at boot time. So for shorter boot time, we shift the burden to + * put_page_bootmem() to serialize writers. 
+ */ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; @@ -750,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) set_page_count(p, 0); } + page_zone(page)->managed_pages += 1 << order; set_page_refcounted(page); __free_pages(page, order); } @@ -2984,6 +2992,7 @@ void show_free_areas(unsigned int filter) " isolated(anon):%lukB" " isolated(file):%lukB" " present:%lukB" + " managed:%lukB" " mlocked:%lukB" " dirty:%lukB" " writeback:%lukB" @@ -3013,6 +3022,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_ISOLATED_ANON)), K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_FILE_DIRTY)), K(zone_page_state(zone, NR_WRITEBACK)), @@ -4502,48 +4512,54 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, memmap_pages; + unsigned long size, realsize, freesize, memmap_pages; size = zone_spanned_pages_in_node(nid, j, zones_size); - realsize = size - zone_absent_pages_in_node(nid, j, + realsize = freesize = size - zone_absent_pages_in_node(nid, j, zholes_size); /* - * Adjust realsize so that it accounts for how much memory + * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ memmap_pages = PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; - if (realsize >= memmap_pages) { - realsize -= memmap_pages; + if (freesize >= memmap_pages) { + freesize -= memmap_pages; if (memmap_pages) printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING - " %s zone: %lu pages exceeds realsize %lu\n", - zone_names[j], memmap_pages, realsize); + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); /* Account for reserved pages */ - if (j == 0 && realsize > dma_reserve) { - realsize -= dma_reserve; + if (j == 0 && freesize > dma_reserve) { + freesize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) - nr_kernel_pages += realsize; - nr_all_pages += realsize; + nr_kernel_pages += freesize; + nr_all_pages += freesize; zone->spanned_pages = size; - zone->present_pages = realsize; + zone->present_pages = freesize; + /* + * Set an approximate value for lowmem here, it will be adjusted + * when the bootmem allocator frees pages into the buddy system. + * And all highmem pages will be managed by the buddy system. + */ + zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) + zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a4a522c0b0f..df14808f0a36 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -994,14 +994,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n high %lu" "\n scanned %lu" "\n spanned %lu" - "\n present %lu", + "\n present %lu" + "\n managed %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, zone->spanned_pages, - zone->present_pages); + zone->present_pages, + zone->managed_pages); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i], -- cgit From 01cefaef40c48c714b7476e62781230993e10dbf Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 12 Dec 2012 13:52:19 -0800 Subject: mm: provide more accurate estimation of pages occupied by memmap If SPARSEMEM is enabled, it won't build page structures for non-existing pages (holes) within a zone, so provide a more accurate estimation of pages occupied by memmap if there are bigger holes within the zone. And pages for highmem zones' memmap will be allocated from lowmem, so charge nr_kernel_pages for that. [akpm@linux-foundation.org: mark calc_memmap_size __paging_init] Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: David Rientjes Cc: Maciej Rutecki Cc: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Tested-by: Jianguo Wu Cc: Dave Hansen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b6a6d04300a..d187988e2b56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4489,6 +4489,26 @@ void __init set_pageblock_order(void) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ +static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) +{ + unsigned long pages = spanned_pages; + + /* + * Provide a more accurate estimation if there are holes within + * the zone and SPARSEMEM is in use. If there are holes within the + * zone, each populated memory region may cost us one or two extra + * memmap pages due to alignment because memmap pages for each + * populated regions may not naturally algined on page boundary. + * So the (present_pages >> 4) heuristic is a tradeoff for that. + */ + if (spanned_pages > present_pages + (present_pages >> 4) && + IS_ENABLED(CONFIG_SPARSEMEM)) + pages = present_pages; + + return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -4523,8 +4543,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * is used by this zone for memmap. 
This affects the watermark * and per-cpu initialisations */ - memmap_pages = - PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; + memmap_pages = calc_memmap_size(size, realsize); if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) @@ -4545,6 +4564,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!is_highmem_idx(j)) nr_kernel_pages += freesize; + /* Charge for highmem memmap if there are enough kernel pages */ + else if (nr_kernel_pages > memmap_pages * 2) + nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; zone->spanned_pages = size; -- cgit From 220f2ac91353dd8b239b70c4b4cf1615b80c1ff5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 12 Dec 2012 13:52:21 -0800 Subject: tmpfs: support SEEK_DATA and SEEK_HOLE (reprise) Revert 3.5's commit f21f8062201f ("tmpfs: revert SEEK_DATA and SEEK_HOLE") to reinstate 4fb5ef089b28 ("tmpfs: support SEEK_DATA and SEEK_HOLE"), with the intervening additional arg to generic_file_llseek_size(). In 3.8, ext4 is expected to join btrfs, ocfs2 and xfs with proper SEEK_DATA and SEEK_HOLE support; and a good case has now been made for it on tmpfs, so let's join the party. It's quite easy for tmpfs to scan the radix_tree to support llseek's new SEEK_DATA and SEEK_HOLE options: so add them while the minutiae are still on my mind (in particular, the !PageUptodate-ness of pages fallocated but still unwritten). [akpm@linux-foundation.org: fix warning with CONFIG_TMPFS=n] Signed-off-by: Hugh Dickins Cc: Dave Chinner Cc: Jaegeuk Hanse Cc: "Theodore Ts'o" Cc: Zheng Liu Cc: Jeff liu Cc: Paul Eggert Cc: Christoph Hellwig Cc: Josef Bacik Cc: Andi Kleen Cc: Andreas Dilger Cc: Marco Stornelli Cc: Chris Mason Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 50c5b8f3a359..03f9ba8fb8e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
+ */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int origin) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (origin == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (origin == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && origin == SEEK_DATA) || + (!page && origin == SEEK_HOLE)) { + done = true; + break; + } + } + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t start, end; + loff_t new_offset; + + if (origin != SEEK_DATA && origin != SEEK_HOLE) + return generic_file_llseek_size(file, offset, origin, + MAX_LFS_FILESIZE, i_size_read(inode)); + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { + if (new_offset < inode->i_size) + offset = new_offset; + else if (origin == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0 && offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + return offset; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = generic_file_llseek, + .llseek = shmem_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, -- cgit From c95d26c2ffd3931123d6a7a2489530ccbd36da7a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 12 Dec 2012 13:52:23 -0800 Subject: memcg: do not check for mm in __mem_cgroup_count_vm_event The mm given to __mem_cgroup_count_vm_event() cannot be NULL because the function is either called from the page fault path or vma->vm_mm is used. So the check can be dropped. The check was introduced by commit 456f998ec817 ("memcg: add the pagefault count into memcg stats") because the originally proposed patch used current->mm for shmem but this has been changed to vma->vm_mm later on without the check being removed (thanks to Hugh for this recollection). 
Signed-off-by: Michal Hocko Cc: Hugh Dickins Cc: Ying Han Cc: David Rientjes Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f0e3571df7e..6c055929c8cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1021,9 +1021,6 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; - if (!mm) - return; - rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) -- cgit From 4128997b5f0e7ad583a5f3990051b8188b39055c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 12 Dec 2012 13:52:25 -0800 Subject: mm: protect against concurrent vma expansion expand_stack() runs with a shared mmap_sem lock. Because of this, there could be multiple concurrent stack expansions in the same mm, which may cause problems in the vma gap update code. I propose to solve this by taking the mm->page_table_lock around such vma expansions, in order to avoid the concurrency issue. We only have to worry about concurrent expand_stack() calls here, since we hold a shared mmap_sem lock and all vma modificaitons other than expand_stack() are done under an exclusive mmap_sem lock. I previously tried to achieve the same effect by making sure all growable vmas in a given mm would share the same anon_vma, which we already lock here. However this turned out to be difficult - all of the schemes I tried for refcounting the growable anon_vma and clearing turned out ugly. So, I'm now proposing only the minimal fix. The overhead of taking the page table lock during stack expansion is expected to be small: glibc doesn't use expandable stacks for the threads it creates, so having multiple growable stacks is actually uncommon and we don't expect the page table lock to get bounced between threads. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9ed3a06242a0..2b7d9e78a569 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2069,6 +2069,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + /* + * vma_gap_update() doesn't support concurrent + * updates, but we only hold a shared mmap_sem + * lock here, so we need to protect against + * concurrent vma expansions. + * vma_lock_anon_vma() doesn't help here, as + * we don't guarantee that all growable vmas + * in a mm share the same root anon vma. + * So, we reuse mm->page_table_lock to guard + * against concurrent vma expansions. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2076,6 +2088,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) vma_gap_update(vma->vm_next); else vma->vm_mm->highest_vm_end = address; + spin_unlock(&vma->vm_mm->page_table_lock); + perf_event_mmap(vma); } } @@ -2126,11 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + /* + * vma_gap_update() doesn't support concurrent + * updates, but we only hold a shared mmap_sem + * lock here, so we need to protect against + * concurrent vma expansions. + * vma_lock_anon_vma() doesn't help here, as + * we don't guarantee that all growable vmas + * in a mm share the same root anon vma. + * So, we reuse mm->page_table_lock to guard + * against concurrent vma expansions. + */ + spin_lock(&vma->vm_mm->page_table_lock); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); + spin_unlock(&vma->vm_mm->page_table_lock); + perf_event_mmap(vma); } } -- cgit From 8c4894c6bc790d0e31e072202939ac6747bbe7ac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Dec 2012 13:52:28 -0800 Subject: hwpoison, hugetlbfs: fix "bad pmd" warning in unmapping hwpoisoned hugepage When a process which used a hwpoisoned hugepage tries to exit() or munmap(), the kernel can print out "bad pmd" message because page table walker in free_pgtables() encounters 'hwpoisoned entry' on pmd. This is because currently we fail to clear the hwpoisoned entry in __unmap_hugepage_range(), so this patch simply does it. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: Wu Fengguang Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bd22bd895299..e53f39cd67db 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2386,8 +2386,10 @@ again: /* * HWPoisoned hugepage is already unmapped and dropped reference */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + pte_clear(mm, address, ptep); continue; + } page = pte_page(pte); /* -- cgit From 5f24ae585be985691c017b7ab90b3669dca32d6d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Dec 2012 13:52:30 -0800 Subject: hwpoison, hugetlbfs: fix RSS-counter warning Memory error handling on hugepages can break a RSS counter, which emits a message like "Bad rss-counter state mm:ffff88040abecac0 idx:1 val:-1". This is because PageAnon returns true for hugepage (this behavior is necessary for reverse mapping to work on hugetlbfs). 
[akpm@linux-foundation.org: clean up code layout] Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: Wu Fengguang Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index cf7e99a87c32..face808a489e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1249,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + if (!PageHuge(page)) { + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); + } set_pte_at(mm, address, pte, - swp_entry_to_pte(make_hwpoison_entry(page))); + swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; -- cgit From 56f2fb147659e05b1e87b99791bf44b988d38545 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Dec 2012 13:52:33 -0800 Subject: mm/hugetlb.c: fix warning on freeing hwpoisoned hugepage Fix the warning from __list_del_entry() which is triggered when a process tries to do free_huge_page() for a hwpoisoned hugepage. free_huge_page() can be called for hwpoisoned hugepage from unpoison_memory(). This function gets refcount once and clears PageHWPoison, and then puts refcount twice to return the hugepage back to free pool. The second put_page() finally reaches free_huge_page(). Signed-off-by: Naoya Horiguchi Reviewed-by: Aneesh Kumar K.V Cc: Andi Kleen Cc: Tony Luck Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e53f39cd67db..22508ef943e6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3172,7 +3172,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) spin_lock(&hugetlb_lock); if (is_hugepage_on_freelist(hpage)) { - list_del(&hpage->lru); + /* + * Hwpoisoned hugepage isn't linked to activelist or freelist, + * but dangling hpage->lru can trigger list-debug warnings + * (this happens when we call unpoison_memory() on it), + * so let it point to itself with list_del_init(). + */ + list_del_init(&hpage->lru); set_page_refcounted(hpage); h->free_huge_pages--; h->free_huge_pages_node[nid]--; -- cgit From 816422ad76474fed8052b6f7b905a054d082e59a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:52:36 -0800 Subject: asm-generic, mm: pgtable: consolidate zero page helpers We have two different implementation of is_zero_pfn() and my_zero_pfn() helpers: for architectures with and without zero page coloring. Let's consolidate them in . Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f9a4b0cb8623..3f680e7c7645 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -717,13 +717,6 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } -#ifndef is_zero_pfn -static inline int is_zero_pfn(unsigned long pfn) -{ - return pfn == zero_pfn; -} -#endif - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. 
* -- cgit From 66521d5aa6c0f74b5e90c21569acbaa8c5ac0998 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Wed, 12 Dec 2012 13:52:37 -0800 Subject: mm/memory.c: remove unused code from do_wp_page() page_mkwrite is initalized with zero and only set once, from that point exists no way to get to the oom or oom_free_new labels. [akpm@linux-foundation.org: cleanup] Signed-off-by: Dominik Dingel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 3f680e7c7645..db2e9e797a05 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2780,13 +2780,8 @@ unlock: oom_free_new: page_cache_release(new_page); oom: - if (old_page) { - if (page_mkwrite) { - unlock_page(old_page); - page_cache_release(old_page); - } + if (old_page) page_cache_release(old_page); - } return VM_FAULT_OOM; unwritable_page: -- cgit From 98870901cce098bbe94d90d2c41d8d1fa8d94392 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Wed, 12 Dec 2012 13:52:39 -0800 Subject: mm/bootmem.c: remove unused wrapper function reserve_bootmem_generic() reserve_bootmem_generic() has no caller, Signed-off-by: Lin Feng Acked-by: Johannes Weiner Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 19262ac05dd2..1324cd74faec 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -460,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - return reserve_bootmem(phys, len, flags); -} - static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { -- cgit From a4f1de176614f634c367e5994a7bcc428c940df0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 16 Dec 2012 18:56:58 -0800 Subject: mm: fix kernel BUG at huge_memory.c:1474! Andrea's autonuma-benchmark numa01 hits kernel BUG at huge_memory.c:1474! in change_huge_pmd called from change_protection from change_prot_numa from task_numa_work. That BUG, introduced in the huge zero page commit cad7f613c4d0 ("thp: change_huge_pmd(): make sure we don't try to make a page writable") was trying to verify that newprot never adds write permission to an anonymous huge page; but Automatic NUMA Balancing's 4b10e7d562c9 ("mm: mempolicy: Implement change_prot_numa() in terms of change_protection()") adds a new prot_numa path into change_huge_pmd(), which makes no use of the newprot provided, and may retain the write bit in the pmd. Just move the BUG_ON(pmd_write(entry)) up into the !prot_numa block. 
Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d7ee1691fd21..32754eece63e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1460,9 +1460,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - if (!prot_numa) + if (!prot_numa) { entry = pmd_modify(entry, newprot); - else { + BUG_ON(pmd_write(entry)); + } else { struct page *page = pmd_page(*pmd); /* only check non-shared pages */ @@ -1471,7 +1472,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknuma(entry); } } - BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; -- cgit From 9360b53661a2c7754517b2925580055bacc8ec38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Dec 2012 11:29:09 -0800 Subject: Revert "bdi: add a user-tunable cpu_list for the bdi flusher threads" This reverts commit 8fa72d234da9b6b473bbb1f74d533663e4996e6b. People disagree about how this should be done, so let's revert this for now so that nobody starts using the new tuning interface. Tejun is thinking about a more generic interface for thread pool affinity. Requested-by: Tejun Heo Acked-by: Jeff Moyer Acked-by: Jens Axboe Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 84 -------------------------------------------------------- 1 file changed, 84 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bd6a6cabef71..d3ca2b3ee176 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,7 +10,6 @@ #include #include #include -#include #include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -222,63 +221,12 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) -static ssize_t cpu_list_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - struct bdi_writeback *wb = &bdi->wb; - cpumask_var_t newmask; - ssize_t ret; - struct task_struct *task; - - if (!alloc_cpumask_var(&newmask, GFP_KERNEL)) - return -ENOMEM; - - ret = cpulist_parse(buf, newmask); - if (!ret) { - spin_lock_bh(&bdi->wb_lock); - task = wb->task; - if (task) - get_task_struct(task); - spin_unlock_bh(&bdi->wb_lock); - - mutex_lock(&bdi->flusher_cpumask_lock); - if (task) { - ret = set_cpus_allowed_ptr(task, newmask); - put_task_struct(task); - } - if (ret == 0) { - cpumask_copy(bdi->flusher_cpumask, newmask); - ret = count; - } - mutex_unlock(&bdi->flusher_cpumask_lock); - - } - free_cpumask_var(newmask); - - return ret; -} - -static ssize_t cpu_list_show(struct device *dev, - struct device_attribute *attr, char *page) -{ - struct backing_dev_info *bdi = dev_get_drvdata(dev); - ssize_t ret; - - mutex_lock(&bdi->flusher_cpumask_lock); - ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - - return ret; -} - #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), - __ATTR_RW(cpu_list), __ATTR_NULL, }; @@ -480,7 +428,6 @@ static int bdi_forker_thread(void *ptr) writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD); } else { - int ret; /* * The spinlock makes sure we 
do not lose * wake-ups when racing with 'bdi_queue_work()'. @@ -490,14 +437,6 @@ static int bdi_forker_thread(void *ptr) spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; spin_unlock_bh(&bdi->wb_lock); - mutex_lock(&bdi->flusher_cpumask_lock); - ret = set_cpus_allowed_ptr(task, - bdi->flusher_cpumask); - mutex_unlock(&bdi->flusher_cpumask_lock); - if (ret) - printk_once("%s: failed to bind flusher" - " thread %s, error %d\n", - __func__, task->comm, ret); wake_up_process(task); } bdi_clear_pending(bdi); @@ -570,17 +509,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, dev_name(dev)); if (IS_ERR(wb->task)) return PTR_ERR(wb->task); - } else { - int node; - /* - * Set up a default cpumask for the flusher threads that - * includes all cpus on the same numa node as the device. - * The mask may be overridden via sysfs. - */ - node = dev_to_node(bdi->dev); - if (node != NUMA_NO_NODE) - cpumask_copy(bdi->flusher_cpumask, - cpumask_of_node(node)); } bdi_debug_register(bdi, dev_name(dev)); @@ -706,15 +634,6 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); - if (!bdi_cap_flush_forker(bdi)) { - bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!bdi->flusher_cpumask) - return -ENOMEM; - cpumask_setall(bdi->flusher_cpumask); - mutex_init(&bdi->flusher_cpumask_lock); - } else - bdi->flusher_cpumask = NULL; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) @@ -737,7 +656,6 @@ int bdi_init(struct backing_dev_info *bdi) err: while (i--) percpu_counter_destroy(&bdi->bdi_stat[i]); - kfree(bdi->flusher_cpumask); } return err; @@ -765,8 +683,6 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); - kfree(bdi->flusher_cpumask); - /* * If bdi_unregister() had already been called earlier, the * wakeup_timer could still be armed because bdi_prune_sb() -- cgit From b3dd20709db2cab0da0ade1f246fd6e6ab1396eb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:24 -0800 Subject: mm/memory.c: suppress warning gcc-4.4.4 screws this up. mm/memory.c: In function 'do_pmd_numa_page': mm/memory.c:3594: warning: no return statement in function returning non-void Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e6a3b933517e..23f1fdfdfcf1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3590,6 +3590,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { BUG(); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 03f9ba8fb8e5..5c90d84c2b02 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1719,7 +1719,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
*/ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int origin) + pgoff_t index, pgoff_t end, int whence) { struct page *page; struct pagevec pvec; @@ -1733,13 +1733,13 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pvec.nr = shmem_find_get_pages_and_swap(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) index = end; break; } for (i = 0; i < pvec.nr; i++, index++) { if (index < indices[i]) { - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { done = true; break; } @@ -1751,8 +1751,8 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, page = NULL; } if (index >= end || - (page && origin == SEEK_DATA) || - (!page && origin == SEEK_HOLE)) { + (page && whence == SEEK_DATA) || + (!page && whence == SEEK_HOLE)) { done = true; break; } @@ -1765,15 +1765,15 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, return index; } -static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t start, end; loff_t new_offset; - if (origin != SEEK_DATA && origin != SEEK_HOLE) - return generic_file_llseek_size(file, offset, origin, + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); mutex_lock(&inode->i_mutex); /* We're holding i_mutex so we can access i_size directly */ @@ -1785,12 +1785,12 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) else { start = offset >> PAGE_CACHE_SHIFT; end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset = shmem_seek_hole_data(mapping, start, end, whence); new_offset <<= PAGE_CACHE_SHIFT; if (new_offset > offset) { if (new_offset < inode->i_size) offset = new_offset; - else if (origin == SEEK_DATA) + else if (whence == SEEK_DATA) offset = -ENXIO; else offset = inode->i_size; -- cgit From 2fbc57c53a815ea30b926dd7627897a02daae302 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Dec 2012 16:01:23 -0800 Subject: mm: use kbasename() Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 23f1fdfdfcf1..e0a9b0ce4f10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -4118,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip) struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_KERNEL); if (buf) { - char *p, *s; + char *p; p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; - s = strrchr(p, '/'); - if (s) - p = s+1; - printk("%s%s[%lx+%lx]", prefix, p, + printk("%s%s[%lx+%lx]", prefix, kbasename(p), vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); -- cgit From ce4a9cc579381bc70b12ebb91c57da31baf8e3b7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Dec 2012 19:50:57 +1100 Subject: mm,numa: fix update_mmu_cache_pmd call This build error is currently hidden by the fact that the x86 implementation of 'update_mmu_cache_pmd()' is a macro that doesn't use its last argument, but commit 
b32967ff101a ("mm: numa: Add THP migration for the NUMA working set scanning fault case") introduced a call with the wrong third argument. In the akpm tree, it causes this build error: mm/migrate.c: In function 'migrate_misplaced_transhuge_page_put': mm/migrate.c:1666:2: error: incompatible type for argument 3 of 'update_mmu_cache_pmd' arch/x86/include/asm/pgtable.h:792:20: note: expected 'struct pmd_t *' but argument is of type 'pmd_t' Fix it. Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 32efd8028bc9..3b676b0c5c3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1734,7 +1734,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, entry); + update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* * Finish the charge transaction under the page table lock to -- cgit From 0bb2c7637ef0db4f44528698fa725179fb4917ad Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 18 Dec 2012 14:21:32 -0800 Subject: mm/page_alloc.c: remove duplicate check While allocating pages using buddy allocator, the compound page is probably split up to free pages. Under these circumstances, the compound page should be destroyed by destroy_compound_page(). However, there is a duplicate check to judge if the page is compound. Remove the duplicate check since the compound_order() returns 0 when the page doesn't have PG_head set in destroy_compound_page(). That is to say, destroy_compound_page() needn't check PageHead(). Signed-off-by: Gavin Shan Acked-by: Johannes Weiner Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d037c8bc1512..62496edbd8dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } -- cgit From c2974058a9caa82c9dd9e1e11e4f2c6ce42ff17e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 18 Dec 2012 14:21:33 -0800 Subject: memory-hotplug: document and enable CONFIG_MOVABLE_NODE Add help info for CONFIG_MOVABLE_NODE and permit its selection. This option allows the user to online all memory of a node as movable memory. So that the whole node can be hotplugged. Users who don't use the hotplug feature are also fine with this option on since they won't online memory as movable. Signed-off-by: Tang Chen Reviewed-by: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Wen Congyang Cc: Ingo Molnar [akpm@linux-foundation.org: tweak help text] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 71259e052ce8..278e3ab1f169 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. 
So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG -- cgit From a0956d54492eb7257b09230680a8812b42cdee92 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Tue, 18 Dec 2012 14:21:36 -0800 Subject: memcg: make it possible to use the stock for more than one page We currently have a percpu stock cache scheme that charges one page at a time from memcg->res, the user counter. When the kernel memory controller comes into play, we'll need to charge more than that. This is because kernel memory allocations will also draw from the user counter, and can be bigger than a single page, as it is the case with the stack (usually 2 pages) or some higher order slabs. [glommer@parallels.com: added a changelog ] Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Acked-by: David Rientjes Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bbfac5063ca8..92887b286246 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2060,20 +2060,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2371,7 +2379,7 @@ again: memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2404,7 @@ again: rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). 
* But considering how consume_stok works, it's not -- cgit From 4c9c535968cacf507c5febe49e6ec5123eadf207 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Tue, 18 Dec 2012 14:21:41 -0800 Subject: memcg: reclaim when more than one page needed mem_cgroup_do_charge() was written before kmem accounting, and expects three cases: being called for 1 page, being called for a stock of 32 pages, or being called for a hugepage. If we call for 2 or 3 pages (and both the stack and several slabs used in process creation are such, at least with the debug options I had), it assumed it's being called for stock and just retried without reclaiming. Fix that by passing down a minsize argument in addition to the csize. And what to do about that (csize == PAGE_SIZE && ret) retry? If it's needed at all (and presumably is since it's there, perhaps to handle races), then it should be extended to more than PAGE_SIZE, yet how far? And should there be a retry count limit, of what? For now retry up to COSTLY_ORDER (as page_alloc.c does) and make sure not to do it if __GFP_NORETRY. v4: fixed nr pages calculation pointed out by Christoph Lameter. Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92887b286246..20e3619870ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2258,7 +2258,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2281,18 +2282,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2305,7 +2306,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. 
*/ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2439,7 +2440,8 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; -- cgit From 86ae53e1a138a3295c04ae69bf404be00244a381 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:45 -0800 Subject: memcg: change defines to an enum This is just a cleanup patch for clarity of expression. In earlier submissions, people asked it to be in a separate patch, so here it is. Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20e3619870ae..c7b0b1b803a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -388,9 +388,12 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -3952,7 +3955,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3988,7 +3992,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4064,7 +4069,8 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4400,7 +4406,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4483,7 +4489,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4561,7 +4567,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4586,7 +4592,7 @@ static void 
mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); -- cgit From 510fc4e11b772fd60f2c545c64d4c55abd07ce36 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:47 -0800 Subject: memcg: kmem accounting basic infrastructure Add the basic infrastructure for the accounting of kernel memory. To control that, the following files are created: * memory.kmem.usage_in_bytes * memory.kmem.limit_in_bytes * memory.kmem.failcnt * memory.kmem.max_usage_in_bytes They have the same meaning of their user memory counterparts. They reflect the state of the "kmem" res_counter. Per cgroup kmem memory accounting is not enabled until a limit is set for the group. Once the limit is set the accounting cannot be disabled for that group. This means that after the patch is applied, no behavioral changes exists for whoever is still using memcg to control their memory usage, until memory.kmem.limit_in_bytes is set for the first time. We always account to both user and kernel resource_counters. This effectively means that an independent kernel limit is in place when the limit is set to a lower value than the user memory. A equal or higher value means that the user limit will always hit first, meaning that kmem is effectively unlimited. People who want to track kernel memory but not limit it, can set this limit to a very high number (like RESOURCE_MAX - 1page - that no one will ever hit, or equal to the user memory) [akpm@linux-foundation.org: MEMCG_MMEM only works with slab and slub] Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Tejun Heo Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 123 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c7b0b1b803a5..bba1cb4bbb82 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -267,6 +267,10 @@ struct mem_cgroup { struct work_struct work_freeing; }; + /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -282,6 +286,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -334,6 +339,20 @@ struct mem_cgroup { #endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ +}; + +#define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. 
"move_charge_at_immitgrate" is treated as a @@ -392,6 +411,7 @@ enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, + _KMEM, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -1456,6 +1476,10 @@ done: res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -3977,6 +4001,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3984,6 +4011,59 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + memcg_kmem_set_active(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); +#endif + return ret; +} + +static void memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + return; + memcg->kmem_account_flags = parent->kmem_account_flags; +} + /* * The user of this function is... * RES_LIMIT. 
@@ -4015,8 +4095,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4082,14 +4166,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4651,6 +4743,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + memcg_propagate_kmem(memcg); return mem_cgroup_sockets_init(memcg, ss); }; @@ -4764,6 +4857,31 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, #endif { }, /* terminate */ }; @@ -5010,6 +5128,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5020,6 +5139,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this -- cgit From 7ae1e1d0f8ac2927ed7e3ca6d15e42d485903459 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:56 -0800 Subject: memcg: kmem controller infrastructure Introduce infrastructure for tracking kernel memory pages to a given memcg. This will happen whenever the caller includes the flag __GFP_KMEMCG flag, and the task belong to a memcg other than the root. In memcontrol.h those functions are wrapped in inline acessors. The idea is to later on, patch those with static branches, so we don't incur any overhead when no mem cgroups with limited kmem are being used. Users of this functionality shall interact with the memcg core code through the following functions: memcg_kmem_newpage_charge: will return true if the group can handle the allocation. At this point, struct page is not yet allocated. 
memcg_kmem_commit_charge: will either revert the charge, if struct page allocation failed, or embed memcg information into page_cgroup. memcg_kmem_uncharge_page: called at free time, will revert the charge. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: Suleiman Souhlal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bba1cb4bbb82..b9afa060b8d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -2661,6 +2665,172 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + /* + * Conditions under which we can wait for the oom_killer. Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->kmem, size); + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); +} + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. 
But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + mem_cgroup_get(memcg); + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + else + mem_cgroup_put(memcg); + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + mem_cgroup_put(memcg); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. + */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + mem_cgroup_put(memcg); +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) -- cgit From 6a1a0d3b625a4091e7a0eb249aefc6a644385149 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:00 -0800 Subject: mm: allocate kernel pages to the right memcg When a process tries to allocate a page with the __GFP_KMEMCG flag, the page allocator will call the corresponding memcg functions to validate the allocation. Tasks in the root memcg can always proceed. To avoid adding markers to the page - and a kmem flag that would necessarily follow, as much as doing page_cgroup lookups for no reason, whoever is marking its allocations with __GFP_KMEMCG flag is responsible for telling the page allocator that this is such an allocation at free_pages() time. This is done by the invocation of __free_accounted_pages() and free_accounted_pages(). 
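As an illustration of that contract, a minimal sketch of a caller (not taken from the patch): it tags an order-1 allocation with __GFP_KMEMCG and must therefore release it through the matching accounted-free helper, which the hunk below introduces as free_memcg_kmem_pages(). alloc_tracked_pages() and free_tracked_pages() are hypothetical names used only for the example.

#include <linux/gfp.h>

/* hypothetical caller: two pages charged to the current task's memcg */
static unsigned long alloc_tracked_pages(void)
{
	return __get_free_pages(GFP_KERNEL | __GFP_KMEMCG, 1);
}

static void free_tracked_pages(unsigned long addr)
{
	/*
	 * The caller, not the allocator, remembers that this allocation
	 * was accounted; a plain free_pages() here would leave the memcg
	 * charge behind.
	 */
	free_memcg_kmem_pages(addr, 1);
}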
Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62496edbd8dd..2ad2ad168efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2612,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2630,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2665,6 +2673,8 @@ out: if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2717,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { -- cgit From 7de37682bec35bbe0cd69b8112ef257bc5fb1c3e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:07 -0800 Subject: memcg: kmem accounting lifecycle management Because kmem charges can outlive the cgroup, we need to make sure that we won't free the memcg structure while charges are still in flight. For reviewing simplicity, the charge functions will issue mem_cgroup_get() at every charge, and mem_cgroup_put() at every uncharge. This can get expensive, however, and we can do better. mem_cgroup_get() only really needs to be issued once: when the first limit is set. In the same spirit, we only need to issue mem_cgroup_put() when the last charge is gone. We'll need an extra bit in kmem_account_flags for that: KMEM_ACCOUNTED_DEAD. it will be set when the cgroup dies, if there are charges in the group. If there aren't, we can proceed right away. Our uncharge function will have to test that bit every time the charges drop to 0. 
Because that is not the likely output of res_counter_uncharge, this should not impose a big hit on us: it is certainly much better than a reference count decrease at every operation. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b9afa060b8d6..9a62ac3ea881 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,6 +346,7 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; #define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) @@ -355,6 +356,23 @@ static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) { set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} #endif /* Stuffs for move charges at task migration. */ @@ -2722,10 +2740,16 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { - res_counter_uncharge(&memcg->kmem, size); res_counter_uncharge(&memcg->res, size); if (do_swap_account) res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } /* @@ -2764,13 +2788,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) return true; } - mem_cgroup_get(memcg); - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); if (!ret) *_memcg = memcg; - else - mem_cgroup_put(memcg); css_put(&memcg->css); return (ret == 0); @@ -2786,7 +2806,6 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, /* The page allocation failed. Revert */ if (!page) { memcg_uncharge_kmem(memcg, PAGE_SIZE << order); - mem_cgroup_put(memcg); return; } @@ -2827,7 +2846,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) VM_BUG_ON(mem_cgroup_is_root(memcg)); memcg_uncharge_kmem(memcg, PAGE_SIZE << order); - mem_cgroup_put(memcg); } #endif /* CONFIG_MEMCG_KMEM */ @@ -4217,6 +4235,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) VM_BUG_ON(ret); memcg_kmem_set_active(memcg); + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. 
+ */ + mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -4232,6 +4257,10 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) if (!parent) return; memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + if (memcg_kmem_is_active(memcg)) + mem_cgroup_get(memcg); +#endif } /* @@ -4920,6 +4949,20 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -- cgit From a8964b9b84f99c0b1b5d7c09520f89f0700e742e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:09 -0800 Subject: memcg: use static branches when code not in use We can use static branches to patch the code in or out when not used. Because the _ACTIVE bit on kmem_accounted is only set after the increment is done, we guarantee that the root memcg will always be selected for kmem charges until all call sites are patched (see memcg_kmem_enabled). This guarantees that no mischarges are applied. Static branch decrement happens when the last reference count from the kmem accounting in memcg dies. This will only happen when the charges drop down to 0. When that happens, we need to disable the static branch only on those memcgs that enabled it. To achieve this, we would be forced to complicate the code by keeping track of which memcgs were the ones that actually enabled limits, and which ones got it from its parents. It is a lot simpler just to do static_key_slow_inc() on every child that is accounted. Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a62ac3ea881..bc70254558fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,10 +346,13 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. 
*/ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; -#define KMEM_ACCOUNTED_MASK (1 << KMEM_ACCOUNTED_ACTIVE) +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) #ifdef CONFIG_MEMCG_KMEM static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) @@ -362,6 +365,11 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) @@ -532,6 +540,26 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +struct static_key memcg_kmem_enabled_key; + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) + static_key_slow_dec(&memcg_kmem_enabled_key); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -4204,6 +4232,8 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -4234,7 +4264,15 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) ret = res_counter_set_limit(&memcg->kmem, val); VM_BUG_ON(ret); - memcg_kmem_set_active(memcg); + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + must_inc_static_branch = true; /* * kmem charges can outlive the cgroup. In the case of slab * pages, for instance, a page contain objects from various @@ -4247,6 +4285,27 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) out: mutex_unlock(&set_limit_mutex); cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. 
+ * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + #endif return ret; } @@ -4258,8 +4317,20 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) return; memcg->kmem_account_flags = parent->kmem_account_flags; #ifdef CONFIG_MEMCG_KMEM - if (memcg_kmem_is_active(memcg)) + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (memcg_kmem_is_active(memcg)) { mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + } #endif } @@ -5184,7 +5255,7 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else -- cgit From bea207c86e4eb158bf20f7b36bee72da9272e8d3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:11 -0800 Subject: memcg: allow a memcg with kmem charges to be destructed Because the ultimate goal of the kmem tracking in memcg is to track slab pages as well, we can't guarantee that we'll always be able to point a page to a particular process, and migrate the charges along with it - since in the common case, a page will contain data belonging to multiple processes. Because of that, when we destroy a memcg, we only make sure the destruction will succeed by discounting the kmem charges from the user charges when we try to empty the cgroup. Signed-off-by: Glauber Costa Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc70254558fa..f96ccc90fa66 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -547,6 +547,11 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) { if (memcg_kmem_is_active(memcg)) static_key_slow_dec(&memcg_kmem_enabled_key); + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -4025,6 +4030,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -4045,13 +4051,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. 
So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* -- cgit From c8b2a36fb1597e9390cf4c1a7f2dd394dc7d7b17 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:13 -0800 Subject: memcg: execute the whole memcg freeing in free_worker() A lot of the initialization we do in mem_cgroup_create() is done with softirqs enabled. This include grabbing a css id, which holds &ss->id_lock->rlock, and the per-zone trees, which holds rtpz->lock->rlock. All of those signal to the lockdep mechanism that those locks can be used in SOFTIRQ-ON-W context. This means that the freeing of memcg structure must happen in a compatible context, otherwise we'll get a deadlock, like the one below, caught by lockdep: free_accounted_pages+0x47/0x4c free_task+0x31/0x5c __put_task_struct+0xc2/0xdb put_task_struct+0x1e/0x22 delayed_put_task_struct+0x7a/0x98 __rcu_process_callbacks+0x269/0x3df rcu_process_callbacks+0x31/0x5b __do_softirq+0x122/0x277 This usage pattern could not be triggered before kmem came into play. With the introduction of kmem stack handling, it is possible that we call the last mem_cgroup_put() from the task destructor, which is run in an rcu callback. Such callbacks are run with softirqs disabled, leading to the offensive usage pattern. In general, we have little, if any, means to guarantee in which context the last memcg_put will happen. The best we can do is test it and try to make sure no invalid context releases are happening. But as we add more code to memcg, the possible interactions grow in number and expose more ways to get context conflicts. One thing to keep in mind, is that part of the freeing process is already deferred to a worker, such as vfree(), that can only be called from process context. For the moment, the only two functions we really need moved away are: * free_css_id(), and * mem_cgroup_remove_from_trees(). But because the later accesses per-zone info, free_mem_cgroup_per_zone_info() needs to be moved as well. With that, we are left with the per_cpu stats only. Better move it all. Signed-off-by: Glauber Costa Tested-by: Greg Thelen Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 66 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f96ccc90fa66..e16694d5e118 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5247,16 +5247,29 @@ out_free: } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. 
+ * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -5275,38 +5288,27 @@ static void free_work(struct work_struct *work) vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -5318,7 +5320,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } -- cgit From ba6c496ed834a37a26fc6fc87fc9aecb0fa0014d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:27 -0800 Subject: slab/slub: struct memcg_params For the kmem slab controller, we need to record some extra information in the kmem_cache structure. 
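A minimal sketch of how slab-common code can consume the is_root_cache() helper added in the hunk below; cache_needs_memcg_uncharge() is a hypothetical function, shown only to illustrate that root caches carry no per-memcg state and get skipped:

static inline bool cache_needs_memcg_uncharge(struct kmem_cache *s)
{
	/* root caches are global; only per-memcg child caches are charged */
	return !is_root_cache(s);
}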
Signed-off-by: Glauber Costa Signed-off-by: Suleiman Souhlal Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 1cb9c9ee0e6f..49e7a8b1d27e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -100,4 +100,17 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +#endif #endif -- cgit From 6ccfb5bcf52bcf100fa085946f044fdbba015048 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:31 -0800 Subject: slab: annotate on-slab caches nodelist locks We currently provide lockdep annotation for kmalloc caches, and also caches that have SLAB_DEBUG_OBJECTS enabled. The reason for this is that we can quite frequently nest in the l3->list_lock lock, which is not something trivial to avoid. My proposal with this patch is to extend this to caches whose slab management object lives within the slab as well ("on_slab"). The need for this arose in the context of testing kmemcg-slab patches. With such a patchset, we can have per-memcg kmalloc caches. So the same path that led to nesting between kmalloc caches could then lead to in-memcg nesting. Because they are not annotated, lockdep will trigger.
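The annotation itself amounts to lockdep re-keying: the nested l3->list_lock acquisitions are put in distinct classes, so the validator treats the legitimate cache-into-cache nesting as two different locks rather than a recursive take of one class. A generic sketch of that pattern, with names invented for illustration (the slab code does this through slab_set_lock_classes(), as the hunk below shows):

static struct lock_class_key nested_list_lock_key;

static void annotate_nested_list_lock(spinlock_t *inner)
{
	/* Give the inner lock its own lockdep class, so taking cache_B's
	 * list_lock while cache_A's list_lock is held is reported as
	 * ordinary nesting of two classes, not a self-deadlock. */
	lockdep_set_class(inner, &nested_list_lock_key);
}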
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2c3a2e0394db..c26ab9fbe1f5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -641,6 +641,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -657,6 +677,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1385,6 +1413,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -2489,7 +2520,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } -- cgit From 2633d7a028239a738b793be5ca8fa6ac312f5793 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:34 -0800 Subject: slab/slub: consider a memcg parameter in kmem_create_cache Allow a memcg parameter to be passed during cache creation. When the slub allocator is being used, it will only merge caches that belong to the same memcg. We'll do this by scanning the global list, and then translating the cache to a memcg-specific cache. A default function is created as a wrapper, passing NULL to the memcg version. We only merge caches that belong to the same memcg. A helper is provided, memcg_css_id: because slub needs a unique cache name for sysfs. Since this is visible, but not the canonical location for slab data, the cache name is not used; the css_id should suffice.
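A hedged sketch of how the two entry points are meant to be used once the series is complete (the object type, cache names and memcg pointer below are invented for illustration; the suffixed per-group naming convention is the one a later patch in this log introduces):

struct kmem_cache *foo_cachep, *memcg_cachep;

/* Ordinary callers are unchanged; the old entry point forwards NULL: */
foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
			       SLAB_HWCACHE_ALIGN, NULL);

/* The memcg code, when cloning a root cache for a group, passes the owner,
 * so slub will only merge this cache with caches of the same memcg: */
memcg_cachep = kmem_cache_create_memcg(memcg, "foo(2:grp)",
				       sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL);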
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.h | 23 +++++++++++++++++++---- mm/slab_common.c | 42 +++++++++++++++++++++++++++++++++--------- mm/slub.c | 19 +++++++++++++++---- 4 files changed, 118 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e16694d5e118..3eafe6cf6ca4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -341,6 +341,14 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; /* internal only representation about the status of kmem accounting. */ @@ -2785,6 +2793,47 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) mem_cgroup_put(memcg); } +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) + s->memcg_params->memcg = memcg; + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + kfree(s->memcg_params); +} + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. 
But the page is not allocated yet, so we'll @@ -5026,7 +5075,9 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + memcg->kmemcg_id = -1; memcg_propagate_kmem(memcg); + return mem_cgroup_sockets_init(memcg, ss); }; diff --git a/mm/slab.h b/mm/slab.h index 49e7a8b1d27e..abe582d20c79 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,12 +43,15 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -106,11 +109,23 @@ static inline bool is_root_cache(struct kmem_cache *s) { return !s->memcg_params || s->memcg_params->is_root_cache; } + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} #else static inline bool is_root_cache(struct kmem_cache *s) { return true; } +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} #endif #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index a8e76d79ee65..3031badcc577 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "slab.h" @@ -27,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -53,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -66,7 +74,8 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } @@ -125,8 +134,9 @@ unsigned long calculate_alignment(unsigned long flags, * as davem. 
*/ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; int err = 0; @@ -134,7 +144,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; /* @@ -145,7 +155,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; @@ -154,6 +164,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align s->object_size = s->size = size; s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -163,10 +180,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); @@ -194,6 +210,13 @@ out_locked: return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) @@ -209,6 +232,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { diff --git a/mm/slub.c b/mm/slub.c index 87f9f32bf0cd..985332b38852 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -3786,7 +3787,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3822,17 +3823,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -5156,6 +5161,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 
1); return name; } -- cgit From 55007d849759252ddd573aeb36143b947202d509 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:38 -0800 Subject: memcg: allocate memory for memcg caches whenever a new memcg appears Every cache that is considered a root cache (basically the "original" caches, tied to the root memcg/no-memcg) will have an array that should be large enough to store a cache pointer per each memcg in the system. Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently in the 64k pointers range. Most of the time, we won't be using that much. What goes in this patch, is a simple scheme to dynamically allocate such an array, in order to minimize memory usage for memcg caches. Because we would also like to avoid allocations all the time, at least for now, the array will only grow. It will tend to be big enough to hold the maximum number of kmem-limited memcgs ever achieved. We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have more than that, we'll start doubling the size of this array every time the limit is reached. Because we are only considering kmem limited memcgs, a natural point for this to happen is when we write to the limit. At that point, we already have set_limit_mutex held, so that will become our natural synchronization mechanism. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++----- mm/slab_common.c | 28 ++++++++ 2 files changed, 219 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eafe6cf6ca4..db38b60e5f87 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -378,6 +378,11 @@ static void memcg_kmem_set_activated(struct mem_cgroup *memcg) set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); } +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) @@ -549,12 +554,48 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #endif #ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +static int memcg_limited_groups_array_size; +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. 
In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + struct static_key memcg_kmem_enabled_key; static void disarm_kmem_keys(struct mem_cgroup *memcg) { - if (memcg_kmem_is_active(memcg)) + if (memcg_kmem_is_active(memcg)) { static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } /* * This check can't live in kmem destruction function, * since the charges will outlive the cgroup @@ -2813,6 +2854,120 @@ int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; + } + + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. 
+ */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. + * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { size_t size = sizeof(struct memcg_cache_params); @@ -2820,6 +2975,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) if (!memcg_kmem_enabled()) return 0; + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) return -ENOMEM; @@ -4326,14 +4484,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) ret = res_counter_set_limit(&memcg->kmem, val); VM_BUG_ON(ret); - /* - * After this point, kmem_accounted (that we test atomically in - * the beginning of this conditional), is no longer 0. This - * guarantees only one process will set the following boolean - * to true. We don't need test_and_set because we're protected - * by the set_limit_mutex anyway. - */ - memcg_kmem_set_activated(memcg); + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } must_inc_static_branch = true; /* * kmem charges can outlive the cgroup. In the case of slab @@ -4372,11 +4527,13 @@ out: return ret; } -static void memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *memcg) { + int ret = 0; struct mem_cgroup *parent = parent_mem_cgroup(memcg); if (!parent) - return; + goto out; + memcg->kmem_account_flags = parent->kmem_account_flags; #ifdef CONFIG_MEMCG_KMEM /* @@ -4389,11 +4546,24 @@ static void memcg_propagate_kmem(struct mem_cgroup *memcg) * It is a lot simpler just to do static_key_slow_inc() on every child * that is accounted. */ - if (memcg_kmem_is_active(memcg)) { - mem_cgroup_get(memcg); - static_key_slow_inc(&memcg_kmem_enabled_key); - } + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. 
We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); #endif +out: + return ret; } /* @@ -5075,8 +5245,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + memcg->kmemcg_id = -1; - memcg_propagate_kmem(memcg); + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; return mem_cgroup_sockets_init(memcg, ss); }; @@ -5479,6 +5653,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. diff --git a/mm/slab_common.c b/mm/slab_common.c index 3031badcc577..1c424b6511bf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -81,6 +81,34 @@ static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + /* * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. -- cgit From d7f25f8a2f81252d1ac134470ba1d0a287cf8fcd Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:40 -0800 Subject: memcg: infrastructure to match an allocation to the right cache The page allocator is able to bind a page to a memcg when it is allocated. But for the caches, we'd like to have as many objects as possible in a page belonging to the same cache. This is done in this patch by calling memcg_kmem_get_cache in the beginning of every allocation function. This function is patched out by static branches when kernel memory controller is not being used. It assumes that the task allocating, which determines the memcg in the page allocator, belongs to the same cgroup throughout the whole process. Misaccounting can happen if the task calls memcg_kmem_get_cache() while belonging to a cgroup, and later on changes. This is considered acceptable, and should only happen upon task migration. Before the cache is created by the memcg core, there is also a possible imbalance: the task belongs to a memcg, but the cache being allocated from is the global cache, since the child cache is not yet guaranteed to be ready. This case is also fine, since in this case the GFP_KMEMCG will not be passed and the page allocator will not attempt any cgroup accounting. 
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db38b60e5f87..efd26620a60b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -588,7 +588,14 @@ static int memcg_limited_groups_array_size; #define MEMCG_CACHES_MIN_SIZE 4 #define MEMCG_CACHES_MAX_SIZE 65535 +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); static void disarm_kmem_keys(struct mem_cgroup *memcg) { @@ -2989,9 +2996,219 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) void memcg_release_cache(struct kmem_cache *s) { + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: kfree(s->memcg_params); } +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor); + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. 
+ */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + new_cachep->memcg_params->root_cache = cachep; + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. 
+ * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + /* * We need to verify if the allocation against current->mm->owner's memcg is * possible for the given order. But the page is not allocated yet, so we'll -- cgit From 0e9d92f2d02d8c8320f0502307c688d07bdac2b3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:42 -0800 Subject: memcg: skip memcg kmem allocations in specified code regions Create a mechanism that skips memcg allocations during certain pieces of our core code. It basically works in the same way as preempt_disable()/preempt_enable(): By marking a region under which all allocations will be accounted to the root memcg. We need this to prevent races in early cache creation, when we allocate data using caches that are not necessarily created already. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index efd26620a60b..65302a083d2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3025,6 +3025,37 @@ out: kfree(s->memcg_params); } +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3084,7 +3115,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out; new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) { new_cachep = cachep; goto out; @@ -3125,8 +3155,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) * Enqueue the creation of a per-memcg kmem_cache. * Called with rcu_read_lock.
*/ -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { struct create_work *cw; @@ -3147,6 +3177,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, schedule_work(&cw->work); } +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3169,6 +3217,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); rcu_read_unlock(); -- cgit From b9ce5ef49f00daf2254c6953c8d31f79aabccd34 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:46 -0800 Subject: sl[au]b: always get the cache from its page in kmem_cache_free() struct page already has this information. If we start chaining caches, this information will always be more trustworthy than whatever is passed into the function. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 +++++- mm/slab.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/slob.c | 2 +- mm/slub.c | 15 +++------------ 4 files changed, 48 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c26ab9fbe1f5..bab6fec765a7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -87,7 +87,6 @@ */ #include -#include "slab.h" #include #include #include @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
@@ -3883,6 +3884,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); diff --git a/mm/slab.h b/mm/slab.h index abe582d20c79..c95e922b166d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -116,6 +116,13 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, return (is_root_cache(cachep) && !memcg) || (cachep->memcg_params->memcg == memcg); } + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -127,5 +134,37 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, { return true; } + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} #endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/mm/slob.c b/mm/slob.c index 795bab7d391d..a99fdf7a0907 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -58,7 +58,6 @@ #include #include -#include "slab.h" #include #include /* struct reclaim_state */ @@ -73,6 +72,7 @@ #include +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). diff --git a/mm/slub.c b/mm/slub.c index 985332b38852..6d5f2305d7a4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2611,19 +2611,10 @@ redo: void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab_cache != s) { - pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab_cache->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -- cgit From d79923fad95b0cdf7770e024677180c734cb7148 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:48 -0800 Subject: sl[au]b: allocate objects from memcg cache We are able to match a cache allocation to a particular memcg. If the task doesn't change groups during the allocation itself - a rare event, this will give us a good picture about who is the first group to touch a cache page. This patch uses the now available infrastructure by calling memcg_kmem_get_cache() before all the cache allocations. 
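Condensed from the hunks that follow, the accounting hangs off a single flag plus the memcg-aware page helpers: the per-memcg clone is marked once at creation time, and the page allocator (patched earlier in this series) charges or uncharges the owning group accordingly. A simplified summary, not a literal hunk:

/* At clone time (kmem_cache_dup() in memcontrol.c): */
new->allocflags |= __GFP_KMEMCG;

/* Backing pages acquired with the flag are charged to the memcg ... */
page = alloc_pages_node(node, flags | __GFP_KMEMCG, order);

/* ... and must be released through the memcg-aware helpers: */
free_memcg_kmem_pages((unsigned long)addr, order);	/* slab */
__free_memcg_kmem_pages(page, order);			/* slub */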
Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ mm/slab.c | 6 +++++- mm/slub.c | 7 ++++--- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65302a083d2f..cc13797d0fbc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3086,6 +3086,9 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor); + if (new) + new->allocflags |= __GFP_KMEMCG; + kfree(name); return new; } diff --git a/mm/slab.c b/mm/slab.c index bab6fec765a7..e265865e8700 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1933,7 +1933,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) } if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -3486,6 +3486,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3571,6 +3573,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); diff --git a/mm/slub.c b/mm/slub.c index 6d5f2305d7a4..ef39e872b8eb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1405,7 +1405,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -2323,6 +2323,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -3284,7 +3285,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3390,7 +3391,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit From 1f458cbf122288b23620ee822e19bcbb76c8d6ec Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:50 -0800 Subject: memcg: destroy memcg caches Implement destruction of memcg caches. Right now, only caches where our reference counter is the last remaining are deleted. If there are any other reference counters around, we just leave the caches lying around until they go away. 
When that happens, a destruction function is called from the cache code. Caches are only destroyed in process context, so we queue them up for later processing in the general case. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.c | 3 +++ mm/slab.h | 23 +++++++++++++++++++++ mm/slub.c | 7 ++++++- 4 files changed, 95 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cc13797d0fbc..270a36789859 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2779,6 +2779,19 @@ static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); } +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. + */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; @@ -3056,6 +3069,31 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + if (!atomic_read(&cachep->memcg_params->nr_pages)) + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. 
+ */ + schedule_work(&cachep->memcg_params->destroy); +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3125,6 +3163,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mem_cgroup_get(memcg); new_cachep->memcg_params->root_cache = cachep; + atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; /* @@ -3143,6 +3182,25 @@ struct create_work { struct work_struct work; }; +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + static void memcg_create_cache_work_func(struct work_struct *w) { struct create_work *cw; @@ -3358,6 +3416,10 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) VM_BUG_ON(mem_cgroup_is_root(memcg)); memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -5975,6 +6037,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/mm/slab.c b/mm/slab.c index e265865e8700..7467343f9fe7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1895,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1931,6 +1932,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); diff --git a/mm/slab.h b/mm/slab.h index c95e922b166d..43d8a38b534f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -117,6 +117,21 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, (cachep->memcg_params->memcg == memcg); } +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -135,6 +150,14 @@ static inline bool cache_match_memcg(struct kmem_cache *cachep, return true; } +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { diff --git a/mm/slub.c b/mm/slub.c 
index ef39e872b8eb..692177bebdf0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1344,6 +1344,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1352,7 +1353,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); + memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1361,7 +1364,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1402,6 +1405,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; -- cgit From 7cf2798240a2a2230cb16a391beef98d8a7ad362 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:55 -0800 Subject: memcg/sl[au]b: track all the memcg children of a kmem_cache This enables us to remove all the children of a kmem_cache being destroyed, if for example the kernel module it's being used in gets unloaded. Otherwise, the children will still point to the destroyed parent. Signed-off-by: Suleiman Souhlal Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- mm/slab_common.c | 3 +++ 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 270a36789859..4b68ec2c8df6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2772,6 +2772,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + #ifdef CONFIG_MEMCG_KMEM static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { @@ -3176,6 +3178,51 @@ out: return new_cachep; } +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. 
+ * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_delayed_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + struct create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; @@ -4284,8 +4331,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 1c424b6511bf..080a43804bf1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -249,6 +249,9 @@ EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; -- cgit From 22933152934f30de6f05b600c03f8a08f853a8d2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:59 -0800 Subject: memcg/sl[au]b: shrink dead caches This means that when we destroy a memcg cache that happened to be empty, those caches may take a lot of time to go away: removing the memcg reference won't destroy them - because there are pending references, and the empty pages will stay there, until a shrinker is called upon for any reason. In this patch, we will call kmem_cache_shrink() for all dead caches that cannot be destroyed because of remaining pages. After shrinking, it is possible that it could be freed. If this is not the case, we'll schedule a lazy worker to keep trying. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b68ec2c8df6..7633e0d429e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3080,7 +3080,27 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) cachep = memcg_params_to_cache(p); - if (!atomic_read(&cachep->memcg_params->nr_pages)) + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. + * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. 
+ * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else kmem_cache_destroy(cachep); } @@ -3089,6 +3109,26 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) if (!cachep->memcg_params->dead) return; + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; /* * We have to defer the actual destroying to a workqueue, because * we might currently be in a context that cannot sleep. @@ -3217,7 +3257,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * set, so flip it down to guarantee we are in control. */ c->memcg_params->dead = false; - cancel_delayed_work_sync(&c->memcg_params->destroy); + cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); } mutex_unlock(&set_limit_mutex); @@ -3242,7 +3282,7 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) cachep = memcg_params_to_cache(params); cachep->memcg_params->dead = true; INIT_WORK(&cachep->memcg_params->destroy, - kmem_cache_destroy_work_func); + kmem_cache_destroy_work_func); schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); -- cgit From 749c54151a6e5b229e4ae067dbc651e54b161fbc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:01 -0800 Subject: memcg: aggregate memcg cache values in slabinfo When we create caches in memcgs, we need to display their usage information somewhere. We'll adopt a scheme similar to /proc/meminfo, with aggregate totals shown in the global file, and per-group information stored in the group itself. For the time being, only reads are allowed in the per-group cache. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 +++++++++++++++++++++++++++++- mm/slab.h | 27 +++++++++++++++++++++++++++ mm/slab_common.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7633e0d429e0..a32d83c2e353 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -572,7 +572,8 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) * increase it. 
*/ static DEFINE_IDA(kmem_limited_groups); -static int memcg_limited_groups_array_size; +int memcg_limited_groups_array_size; + /* * MIN_SIZE is different than 1, because we would like to avoid going through * the alloc/free process all the time. In a small machine, 4 kmem-limited @@ -2794,6 +2795,27 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; } +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; @@ -5822,6 +5844,12 @@ static struct cftype mem_cgroup_files[] = { .trigger = mem_cgroup_reset, .read = mem_cgroup_read, }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif #endif { }, /* terminate */ }; diff --git a/mm/slab.h b/mm/slab.h index 43d8a38b534f..ec5dae1c8e75 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -138,6 +138,23 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, return (p == s) || (s->memcg_params && (p == s->memcg_params->root_cache)); } + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -163,6 +180,16 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, { return true; } + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 080a43804bf1..081f1b8d9a7b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -322,7 +322,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) +void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -366,16 +366,43 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&slab_mutex); } -static int s_show(struct seq_file *m, void *p) +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + 
info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); struct slabinfo sinfo; memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(s, &sinfo); + memcg_accumulate_slabinfo(s, &sinfo); + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - s->name, sinfo.active_objs, sinfo.num_objs, s->size, + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, sinfo.objects_per_slab, (1 << sinfo.cache_order)); seq_printf(m, " : tunables %4u %4u %4u", @@ -387,6 +414,15 @@ static int s_show(struct seq_file *m, void *p) return 0; } +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + /* * slabinfo_op - iterator that generates /proc/slabinfo * -- cgit From 943a451a87d229ca564a27274b58eaeae35fde5d Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:03 -0800 Subject: slab: propagate tunable values SLAB allows us to tune a particular cache behavior with tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This could be done by an explicit call to do_tune_cpucache() after the cache is created. But this is not very convenient now that the caches are created from common code, since this function is SLAB-specific. Another method of doing that is taking advantage of the fact that do_tune_cpucache() is always called from enable_cpucache(), which is called at cache initialization. We can just preset the values, and then things work as expected. It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. This change will require us to move the assignment of root_cache in memcg_params a bit earlier. 
We need this to be already set - which memcg_kmem_register_cache will do - when we reach __kmem_cache_create() Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++---- mm/slab.c | 44 +++++++++++++++++++++++++++++++++++++++++--- mm/slab.h | 12 ++++++++++++ mm/slab_common.c | 7 ++++--- 4 files changed, 63 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a32d83c2e353..f3009b4bae51 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3012,7 +3012,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) { size_t size = sizeof(struct memcg_cache_params); @@ -3026,8 +3027,10 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s) if (!s->memcg_params) return -ENOMEM; - if (memcg) + if (memcg) { s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } return 0; } @@ -3186,7 +3189,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, return NULL; new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor); + (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; @@ -3226,7 +3229,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, } mem_cgroup_get(memcg); - new_cachep->memcg_params->root_cache = cachep; atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; diff --git a/mm/slab.c b/mm/slab.c index 7467343f9fe7..4dcbf96a77b4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4041,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4084,12 +4084,48 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, 
i.e. return objects that are cache-warm @@ -4131,7 +4167,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); diff --git a/mm/slab.h b/mm/slab.h index ec5dae1c8e75..34a98d642196 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -155,6 +155,13 @@ static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { return s->memcg_params->memcg_caches[idx]; } + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -190,6 +197,11 @@ static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { return NULL; } + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 081f1b8d9a7b..3f3cd97d3fdf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -164,7 +164,8 @@ unsigned long calculate_alignment(unsigned long flags, struct kmem_cache * kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -193,7 +194,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s->align = calculate_alignment(flags, align, size); s->ctor = ctor; - if (memcg_register_cache(memcg, s)) { + if (memcg_register_cache(memcg, s, parent_cache)) { kmem_cache_free(kmem_cache, s); err = -ENOMEM; goto out_locked; @@ -243,7 +244,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor); + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); } EXPORT_SYMBOL(kmem_cache_create); -- cgit From 107dab5c92d5f9c3afe962036e47c207363255c7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:05 -0800 Subject: slub: slub-specific propagation changes SLUB allows us to tune a particular cache behavior with sysfs-based tunables. When creating a new memcg cache copy, we'd like to preserve any tunables the parent cache already had. This can be done by tapping into the store attribute function provided by the allocator. We of course don't need to mess with read-only fields. Since the attributes can have multiple types and are stored internally by sysfs, the best strategy is to issue a ->show() in the root cache, and then ->store() in the memcg cache. The drawback of that, is that sysfs can allocate up to a page in buffering for show(), that we are likely not to need, but also can't guarantee. To avoid always allocating a page for that, we can update the caches at store time with the maximum attribute size ever stored to the root cache. We will then get a buffer big enough to hold it. The corolary to this, is that if no stores happened, nothing will be propagated. 
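In rough terms, the copy performed for a new memcg cache boils down to the sketch below ("root" and "child" are illustrative names for the root cache and its memcg copy; the real loop, including the buffer handling described above, is memcg_propagate_slab_attrs() in the diff that follows):

	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
		struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);

		/* only attributes that can be both read and written are copied */
		if (!attr || !attr->store || !attr->show)
			continue;

		attr->show(root, buf);			/* render the parent's current value */
		attr->store(child, buf, strlen(buf));	/* replay it into the memcg copy */
	}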
It can also happen that a root cache has its tunables updated during normal system operation. In this case, we will propagate the change to all caches that are already active. [akpm@linux-foundation.org: tweak code to avoid __maybe_unused] Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 692177bebdf0..21c94d9695ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -201,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -3865,6 +3866,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (slab_state <= UP) return 0; + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5098,10 +5100,82 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + /* + * This function's return value is determined by the + * parent cache only + */ + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This mean this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. 
+ */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, -- cgit From ebe945c27628fca03723582eba138acc2e2f3d15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:10 -0800 Subject: memcg: add comments clarifying aspects of cache attribute propagation This patch clarifies two aspects of cache attribute propagation. First, the expected context for the for_each_memcg_cache macro in memcontrol.h. The usages already in the codebase are safe. In mm/slub.c, it is trivially safe because the lock is acquired right before the loop. In mm/slab.c, it is less so: the lock is acquired by an outer function a few steps back in the stack, so a VM_BUG_ON() is added to make sure it is indeed safe. A comment is also added to detail why we are returning the value of the parent cache and ignoring the children's when we propagate the attributes. Signed-off-by: Glauber Costa Cc: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + mm/slub.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4dcbf96a77b4..e7667a3584bc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4099,6 +4099,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); for_each_memcg_cache_index(i) { c = cache_from_memcg(cachep, i); if (c) diff --git a/mm/slub.c b/mm/slub.c index 21c94d9695ec..efe2cffc29b0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5108,12 +5108,25 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (s->max_attr_size < len) s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ for_each_memcg_cache_index(i) { struct kmem_cache *c = cache_from_memcg(s, i); - /* - * This function's return value is determined by the - * parent cache only - */ if (c) attribute->store(c, buf, len); } -- cgit From 5413dfba88d6f6090c8cdf181ab9172d24752f8f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:23:13 -0800 Subject: slub: drop mutex before deleting sysfs entry Sasha Levin recently reported a lockdep problem resulting from the new attribute propagation introduced by kmemcg series. In short, slab_mutex will be called from within the sysfs attribute store function. 
This will create a dependency, that will later be held backwards when a cache is destroyed - since destruction occurs with the slab_mutex held, and then calls in to the sysfs directory removal function. In this patch, I propose to adopt a strategy close to what __kmem_cache_create does before calling sysfs_slab_add, and release the lock before the call to sysfs_slab_remove. This is pretty much the last operation in the kmem_cache_shutdown() path, so we could do better by splitting this and moving this call alone to later on. This will fit nicely when sysfs handling is consistent between all caches, but will look weird now. Lockdep info: ====================================================== [ INFO: possible circular locking dependency detected ] 3.7.0-rc4-next-20121106-sasha-00008-g353b62f #117 Tainted: G W ------------------------------------------------------- trinity-child13/6961 is trying to acquire lock: (s_active#43){++++.+}, at: sysfs_addrm_finish+0x31/0x60 but task is already holding lock: (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x22/0xe0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (slab_mutex){+.+.+.}: lock_acquire+0x1aa/0x240 __mutex_lock_common+0x59/0x5a0 mutex_lock_nested+0x3f/0x50 slab_attr_store+0xde/0x110 sysfs_write_file+0xfa/0x150 vfs_write+0xb0/0x180 sys_pwrite64+0x60/0xb0 tracesys+0xe1/0xe6 -> #0 (s_active#43){++++.+}: __lock_acquire+0x14df/0x1ca0 lock_acquire+0x1aa/0x240 sysfs_deactivate+0x122/0x1a0 sysfs_addrm_finish+0x31/0x60 sysfs_remove_dir+0x89/0xd0 kobject_del+0x16/0x40 __kmem_cache_shutdown+0x40/0x60 kmem_cache_destroy+0x40/0xe0 mon_text_release+0x78/0xe0 __fput+0x122/0x2d0 ____fput+0x9/0x10 task_work_run+0xbe/0x100 do_exit+0x432/0xbd0 do_group_exit+0x84/0xd0 get_signal_to_deliver+0x81d/0x930 do_signal+0x3a/0x950 do_notify_resume+0x3e/0x90 int_signal+0x12/0x17 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slab_mutex); lock(s_active#43); lock(slab_mutex); lock(s_active#43); *** DEADLOCK *** 2 locks held by trinity-child13/6961: #0: (mon_lock){+.+.+.}, at: mon_text_release+0x25/0xe0 #1: (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x22/0xe0 stack backtrace: Pid: 6961, comm: trinity-child13 Tainted: G W 3.7.0-rc4-next-20121106-sasha-00008-g353b62f #117 Call Trace: print_circular_bug+0x1fb/0x20c __lock_acquire+0x14df/0x1ca0 lock_acquire+0x1aa/0x240 sysfs_deactivate+0x122/0x1a0 sysfs_addrm_finish+0x31/0x60 sysfs_remove_dir+0x89/0xd0 kobject_del+0x16/0x40 __kmem_cache_shutdown+0x40/0x60 kmem_cache_destroy+0x40/0xe0 mon_text_release+0x78/0xe0 __fput+0x122/0x2d0 ____fput+0x9/0x10 task_work_run+0xbe/0x100 do_exit+0x432/0xbd0 do_group_exit+0x84/0xd0 get_signal_to_deliver+0x81d/0x930 do_signal+0x3a/0x950 do_notify_resume+0x3e/0x90 int_signal+0x12/0x17 Signed-off-by: Glauber Costa Reported-by: Sasha Levin Cc: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index efe2cffc29b0..ba2ca53f6c3a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3153,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. 
Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } -- cgit From 7d12efaea7e74bf8f4953412514e836313fa32ec Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Dec 2012 14:23:17 -0800 Subject: mm/mprotect.c: coding-style cleanups A few gremlins have recently crept in. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 3dca970367db..94722a4d6b43 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < 
vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { -- cgit From 7179e7bf4592ac5a7b30257a7df6259ee81e51da Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 18 Dec 2012 14:23:19 -0800 Subject: mm/hugetlb: create hugetlb cgroup file in hugetlb_init Build a kernel with CONFIG_HUGETLBFS=y, CONFIG_HUGETLB_PAGE=y and CONFIG_CGROUP_HUGETLB=y, then specify the hugepagesz=xx boot option, and the system will fail to boot. This failure is caused by the following code path: setup_hugepagesz hugetlb_add_hstate hugetlb_cgroup_file_init cgroup_add_cftypes kzalloc <--slab is *not available* yet On this path, slab is not available yet, so the memory allocation fails and causes a WARN_ON() in hugetlb_cgroup_file_init(). So I move hugetlb_cgroup_file_init() into hugetlb_init(). [akpm@linux-foundation.org: tweak coding-style, remove pointless __init on inlined function] [akpm@linux-foundation.org: fix warning] Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reviewed-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 +---------- mm/hugetlb_cgroup.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5318c7793ae..4f3ea0b1e57c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. - */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b5bde7a5c017..9cea7de22ffb 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. 
+ */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* -- cgit From 79a4dcefd3256896416f5a94d2d1442a7f5aa9b8 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 18 Dec 2012 14:23:24 -0800 Subject: mm/memory_hotplug.c: improve comments Signed-off-by: Tang Chen Cc: Jiang Liu Cc: Lai Jiangshan Cc: Wen Congyang Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962e353aa86f..d04ed87bfacb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. + */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, -- cgit From dc053733ea44babedb20266300b984d6add8b9e5 Mon Sep 17 00:00:00 2001 From: Abhijit Pawar Date: Tue, 18 Dec 2012 14:23:27 -0800 Subject: mm/kmemleak.c: remove obsolete simple_strtoul Replace the obsolete simple_strtoul() with kstrtoul(). 
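The practical difference between the two helpers, in a minimal sketch (str and addr mirror the dump_str_object_info() context shown in the diff below):

	unsigned long addr;

	/* before: simple_strtoul() gives no error indication for malformed input */
	addr = simple_strtoul(str, NULL, 0);

	/* after: kstrtoul() returns 0 on success or a negative errno, so bad input can be rejected */
	if (kstrtoul(str, 0, &addr))
		return -EINVAL;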
Signed-off-by: Abhijit Pawar Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); -- cgit From d37dd5dcb955dd8c2cdd4eaef1f15d1b7ecbc379 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 18 Dec 2012 14:23:28 -0800 Subject: vmscan: comment too_many_isolated() Comment "Why it's doing so" rather than "What it does" as proposed by Andrew Morton. Signed-off-by: Wu Fengguang Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f3096137b8a..e73d0206dddd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) -- cgit From 3cf23841b4b76eb94d3f8d0fb3627690e4431413 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 18 Dec 2012 14:23:31 -0800 Subject: mm/vmscan.c: avoid possible deadlock caused by too_many_isolated() Neil found that if too_many_isolated() returns true while performing direct reclaim we can end up waiting for other threads to complete their direct reclaim. If those threads are allowed to enter the FS or IO to free memory, but this thread is not, then it is possible that those threads will be waiting on this thread and so we get a circular deadlock. some task enters direct reclaim with GFP_KERNEL => too_many_isolated() false => vmscan and run into dirty pages => pageout() => take some FS lock => fs/block code does GFP_NOIO allocation => enter direct reclaim again => too_many_isolated() true => waiting for others to progress, however the other tasks may be circular waiting for the FS lock.. The fix is to let !__GFP_IO and !__GFP_FS direct reclaims enjoy higher priority than normal ones, by lowering the throttle threshold for the latter. Allowing ~1/8 isolated pages in normal is large enough. For example, for a 1GB LRU list, that's ~128MB isolated pages, or 1k blocked tasks (each isolates 32 4KB pages), or 64 blocked tasks per logical CPU (assuming 16 logical CPUs per NUMA node). So it's not likely some CPU goes idle waiting (when it could make progress) because of this limit: there are much more sleeping reclaim tasks than the number of CPU, so the task may well be blocked by some low level queue/lock anyway. Now !GFP_IOFS reclaims won't be waiting for GFP_IOFS reclaims to progress. 
They will be blocked only when there are too many concurrent !GFP_IOFS reclaims, however that's very unlikely because the IO-less direct reclaims is able to progress much more faster, and they won't deadlock each other. The threshold is raised high enough for them, so that there can be sufficient parallel progress of !GFP_IOFS reclaims. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Wu Fengguang Cc: Torsten Kaiser Tested-by: NeilBrown Reviewed-by: Minchan Kim Acked-by: KOSAKI Motohiro Acked-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e73d0206dddd..828530e2794a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1202,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } -- cgit From cda73a10eb3f493871ed39f468db50a65ebeddce Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Thu, 20 Dec 2012 00:25:13 +0100 Subject: mm: do not sleep in balance_pgdat if there's no i/o congestion On a 4GB RAM machine, where Normal zone is much smaller than DMA32 zone, the Normal zone gets fragmented in time. This requires relatively more pressure in balance_pgdat to get the zone above the required watermark. Unfortunately, the congestion_wait() call in there slows it down for a completely wrong reason, expecting that there's a lot of writeback/swapout, even when there's none (much more common). After a few days, when fragmentation progresses, this flawed logic translates to a very high CPU iowait times, even though there's no I/O congestion at all. If THP is enabled, the problem occurs sooner, but I was able to see it even on !THP kernels, just by giving it a bit more time to occur. The proper way to deal with this is to not wait, unless there's congestion. Thanks to Mel Gorman, we already have the function that perfectly fits the job. The patch was tested on a machine which nicely revealed the problem after only 1 day of uptime, and it's been working great. Signed-off-by: Zlatko Calusic Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 828530e2794a..adc7e9058181 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2570,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - int all_zones_ok; + struct zone *unbalanced_zone; unsigned long balanced; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -2604,7 +2604,7 @@ loop_again: unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - all_zones_ok = 1; + unbalanced_zone = NULL; balanced = 0; /* @@ -2743,7 +2743,7 @@ loop_again: } if (!zone_balanced(zone, testorder, 0, end_zone)) { - all_zones_ok = 0; + unbalanced_zone = zone; /* * We are still under min water mark. 
This * means that we have a GFP_ATOMIC allocation @@ -2776,7 +2776,7 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) + if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2786,7 +2786,7 @@ loop_again: if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); else - congestion_wait(BLK_RW_ASYNC, HZ/10); + wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); } /* @@ -2805,7 +2805,7 @@ out: * high-order: Balanced zones must make up at least 25% of the node * for the node to be balanced */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { + if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { cond_resched(); try_to_freeze(); -- cgit From b6b19f25f69149c0912788fb81466dd2310bb095 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Dec 2012 17:44:29 -0800 Subject: ksm: make rmap walks more scalable The rmap walks in ksm.c are like those in rmap.c: they can safely be done with anon_vma_lock_read(). Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/ksm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 82dfb4b54321..51573858938d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1624,7 +1624,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1648,7 +1648,7 @@ again: if (!search_new_forks || !mapcount) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); if (!mapcount) goto out; } @@ -1678,7 +1678,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1697,11 +1697,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; @@ -1731,7 +1731,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1749,11 +1749,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -- cgit From 7898575fc81bd707ce0844cb06874d48e39bbe09 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 12:00:02 +0100 Subject: mm: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- mm/truncate.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index d51ce92d6e83..c75b736e54b7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -576,29 +576,6 @@ void truncate_setsize(struct inode *inode, 
loff_t newsize) } EXPORT_SYMBOL(truncate_setsize); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @newsize: file offset to start truncating - * - * This function is deprecated and truncate_setsize or truncate_pagecache - * should be used instead, together with filesystem specific block truncation. - */ -int vmtruncate(struct inode *inode, loff_t newsize) -{ - int error; - - error = inode_newsize_ok(inode, newsize); - if (error) - return error; - - truncate_setsize(inode, newsize); - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -} -EXPORT_SYMBOL(vmtruncate); - /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit From 010fc29a45a2e8dbc08bf45ef80b8622619aaae0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 20 Dec 2012 15:05:06 -0800 Subject: compaction: fix build error in CMA && !COMPACTION isolate_freepages_block() and isolate_migratepages_range() are used for CMA as well as compaction so it breaks build for CONFIG_CMA && !CONFIG_COMPACTION. This patch fixes it. [akpm@linux-foundation.org: add "do { } while (0)", per Mel] Signed-off-by: Minchan Kim Cc: Mel Gorman Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 5ad7f4f4d6f7..6b807e466497 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -17,6 +17,21 @@ #include #include "internal.h" +#ifdef CONFIG_COMPACTION +static inline void count_compact_event(enum vm_event_item item) +{ + count_vm_event(item); +} + +static inline void count_compact_events(enum vm_event_item item, long delta) +{ + count_vm_events(item, delta); +} +#else +#define count_compact_event(item) do { } while (0) +#define count_compact_events(item, delta) do { } while (0) +#endif + #if defined CONFIG_COMPACTION || defined CONFIG_CMA #define CREATE_TRACE_POINTS @@ -303,10 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_vm_events(COMPACTFREE_SCANNED, nr_scanned); + count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) - count_vm_events(COMPACTISOLATED, total_isolated); - + count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; } @@ -613,9 +627,9 @@ next_pageblock: trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); + count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); if (nr_isolated) - count_vm_events(COMPACTISOLATED, nr_isolated); + count_compact_events(COMPACTISOLATED, nr_isolated); return low_pfn; } @@ -1110,7 +1124,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return rc; - count_vm_event(COMPACTSTALL); + count_compact_event(COMPACTSTALL); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) -- cgit From c8b74c2f6604923de91f8aa6539f8bb934736754 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Thu, 20 Dec 2012 15:05:07 -0800 Subject: mm: fix calculation of dirtyable memory The system uses global_dirtyable_memory() to calculate number of dirtyable pages/pages that can be allocated to the page cache. A bug causes an underflow thus making the page count look like a big unsigned number. 
This in turn confuses the dirty writeback throttling to aggressively write back pages as they become dirty (usually 1 page at a time). This generally only affects systems with highmem because the underflowed count gets subtracted from the global count of dirtyable memory. The problem was introduced with v3.2-4896-gab8fabd. The fix is to ensure we don't get an underflowed total of either highmem or global dirtyable memory. Signed-off-by: Sonny Rao Signed-off-by: Puneet Kumar Acked-by: Johannes Weiner Tested-by: Damien Wyart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4271224493..0713bfbf0954 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -200,6 +200,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) x += zone_page_state(z, NR_FREE_PAGES) + zone_reclaimable_pages(z) - z->dirty_balance_reserve; } + /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only @@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - - dirty_balance_reserve; + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x -= min(x, dirty_balance_reserve); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) * highmem zone can hold its share of dirty pages, so we don't * care about vm_highmem_is_dirtyable here. */ - return zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone) - - zone->dirty_balance_reserve; + unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone); + + /* don't allow this to underflow */ + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + return nr_pages; } /** -- cgit From bcc2b02f4c1b36bc67272df7119b75bac78525ab Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 20 Dec 2012 15:05:18 -0800 Subject: mm: cma: WARN if freed memory is still in use Memory returned to free_contig_range() must have no other references. Let the kernel complain loudly if the page reference count is not equal to 1. 
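The added check relies on a small counting idiom, sketched here with the helper the patch itself uses (page_count() reads the page's reference count; the surrounding loop over the range is omitted and shown in the diff below):

	/* the comparison yields 0 or 1, so count ends up as the number of still-referenced pages */
	count += page_count(page) != 1;

	/* after the loop: one warning for the whole range instead of one splat per leaked page */
	WARN(count != 0, "%d pages are still in use!\n", count);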
[rientjes@google.com: support sparsemem] Signed-off-by: Marek Szyprowski Reviewed-by: Kyungmin Park Acked-by: Michal Nazarewicz Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ad2ad168efe..4ba5e37127fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5978,8 +5978,15 @@ done: void free_contig_range(unsigned long pfn, unsigned nr_pages) { - for (; nr_pages--; ++pfn) - __free_page(pfn_to_page(pfn)); + unsigned int count = 0; + + for (; nr_pages--; pfn++) { + struct page *page = pfn_to_page(pfn); + + count += page_count(page) != 1; + __free_page(page); + } + WARN(count != 0, "%d pages are still in use!\n", count); } #endif -- cgit From 2c79737af83e0d586899f48c6010148ea2064369 Mon Sep 17 00:00:00 2001 From: Jeremy Eder Date: Thu, 20 Dec 2012 15:05:32 -0800 Subject: mm: clean up transparent hugepage sysfs error messages Clarify error messages and correct a few typos in the transparent hugepage sysfs init code. Signed-off-by: Jeremy Eder Acked-by: Rafael Aquini Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 32754eece63e..9e894edc7811 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -574,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed kobject create\n"); + printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto remove_hp_group; } -- cgit From 154b454edaf6d94a69016db6c342c57fa935bbe9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Dec 2012 15:05:40 -0800 Subject: memcg: don't register hotcpu notifier from ->css_alloc() Commit 648bb56d076b ("cgroup: lock cgroup_mutex in cgroup_init_subsys()") made cgroup_init_subsys() grab cgroup_mutex before invoking ->css_alloc() for the root css. Because memcg registers hotcpu notifier from ->css_alloc() for the root css, this introduced circular locking dependency between cgroup_mutex and cpu hotplug. Fix it by moving hotcpu notifier registration to a subsys initcall. ====================================================== [ INFO: possible circular locking dependency detected ] 3.7.0-rc4-work+ #42 Not tainted ------------------------------------------------------- bash/645 is trying to acquire lock: (cgroup_mutex){+.+.+.}, at: [] cgroup_lock+0x17/0x20 but task is already holding lock: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2f/0x60 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (cpu_hotplug.lock){+.+.+.}: lock_acquire+0x97/0x1e0 mutex_lock_nested+0x61/0x3b0 get_online_cpus+0x3c/0x60 rebuild_sched_domains_locked+0x1b/0x70 cpuset_write_resmask+0x298/0x2c0 cgroup_file_write+0x1ef/0x300 vfs_write+0xa8/0x160 sys_write+0x52/0xa0 system_call_fastpath+0x16/0x1b -> #0 (cgroup_mutex){+.+.+.}: __lock_acquire+0x14ce/0x1d20 lock_acquire+0x97/0x1e0 mutex_lock_nested+0x61/0x3b0 cgroup_lock+0x17/0x20 cpuset_handle_hotplug+0x1b/0x560 cpuset_update_active_cpus+0xe/0x10 cpuset_cpu_inactive+0x47/0x50 notifier_call_chain+0x66/0x150 __raw_notifier_call_chain+0xe/0x10 __cpu_notify+0x20/0x40 _cpu_down+0x7e/0x2f0 cpu_down+0x36/0x50 store_online+0x5d/0xe0 dev_attr_store+0x18/0x30 sysfs_write_file+0xe0/0x150 vfs_write+0xa8/0x160 sys_write+0x52/0xa0 system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug.lock); lock(cgroup_mutex); lock(cpu_hotplug.lock); lock(cgroup_mutex); *** DEADLOCK *** 5 locks held by bash/645: #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x48/0x150 #1: (s_active#42){.+.+.+}, at: [] sysfs_write_file+0xc8/0x150 #2: (x86_cpu_hotplug_driver_mutex){+.+...}, at: [] cpu_hotplug_driver_lock+0x1 +7/0x20 #3: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_maps_update_begin+0x17/0x20 #4: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2f/0x60 stack backtrace: Pid: 645, comm: bash Not tainted 3.7.0-rc4-work+ #42 Call Trace: print_circular_bug+0x28e/0x29f __lock_acquire+0x14ce/0x1d20 lock_acquire+0x97/0x1e0 mutex_lock_nested+0x61/0x3b0 cgroup_lock+0x17/0x20 cpuset_handle_hotplug+0x1b/0x560 cpuset_update_active_cpus+0xe/0x10 cpuset_cpu_inactive+0x47/0x50 notifier_call_chain+0x66/0x150 __raw_notifier_call_chain+0xe/0x10 __cpu_notify+0x20/0x40 _cpu_down+0x7e/0x2f0 cpu_down+0x36/0x50 store_online+0x5d/0xe0 dev_attr_store+0x18/0x30 sysfs_write_file+0xe0/0x150 vfs_write+0xa8/0x160 sys_write+0x52/0xa0 system_call_fastpath+0x16/0x1b Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3009b4bae51..09255ec8159c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6090,7 +6090,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) &per_cpu(memcg_stock, cpu); INIT_WORK(&stock->work, drain_local_stock); } - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; @@ -6756,6 +6755,19 @@ struct cgroup_subsys mem_cgroup_subsys = { .use_id = 1, }; +/* + * The rest of init is performed during ->css_alloc() for root css which + * happens before initcalls. hotcpu_notifier() can't be done together as + * it would introduce circular locking by adding cgroup_lock -> cpu hotplug + * dependency. Do it from a subsys_initcall(). 
+ */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + return 0; +} +subsys_initcall(mem_cgroup_init); + #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { -- cgit From 4ae0a48b5efc44a95f5e7bb578f9de71fd35bfd0 Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Sun, 23 Dec 2012 15:12:54 +0100 Subject: mm: modify pgdat_balanced() so that it also handles order-0 Teach pgdat_balanced() about order-0 allocations so that we can simplify code in a few places in vmstat.c. Suggested-by: Andrew Morton Signed-off-by: Zlatko Calusic Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/vmscan.c | 105 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 45 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index adc7e9058181..23291b9ae871 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2452,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order, } /* - * pgdat_balanced is used when checking if a node is balanced for high-order - * allocations. Only zones that meet watermarks and are in a zone allowed - * by the callers classzone_idx are added to balanced_pages. The total of - * balanced pages must be at least 25% of the zones allowed by classzone_idx - * for the node to be considered balanced. Forcing all zones to be balanced - * for high orders can cause excessive reclaim when there are imbalanced zones. + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. * The choice of 25% is due to * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine @@ -2467,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order, * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. */ -static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, - int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { unsigned long present_pages = 0; + unsigned long balanced_pages = 0; int i; - for (i = 0; i <= classzone_idx; i++) - present_pages += pgdat->node_zones[i].present_pages; + /* Check the watermark levels */ + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; - /* A special case here: if zone has no page, we think it's balanced */ - return balanced_pages >= (present_pages >> 2); + if (!populated_zone(zone)) + continue; + + present_pages += zone->present_pages; + + /* + * A special case here: + * + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well! 
+ */ + if (zone->all_unreclaimable) { + balanced_pages += zone->present_pages; + continue; + } + + if (zone_balanced(zone, order, 0, i)) + balanced_pages += zone->present_pages; + else if (!order) + return false; + } + + if (order) + return balanced_pages >= (present_pages >> 2); + else + return true; } /* @@ -2489,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, int classzone_idx) { - int i; - unsigned long balanced = 0; - bool all_zones_ok = true; - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) return false; @@ -2511,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, return false; } - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well if kswapd - * is to sleep - */ - if (zone->all_unreclaimable) { - balanced += zone->present_pages; - continue; - } - - if (!zone_balanced(zone, order, 0, i)) - all_zones_ok = false; - else - balanced += zone->present_pages; - } - - /* - * For high-order requests, the balanced zones must contain at least - * 25% of the nodes pages for kswapd to sleep. For order-0, all zones - * must be balanced - */ - if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); - else - return all_zones_ok; + return pgdat_balanced(pgdat, order, classzone_idx); } /* @@ -2571,7 +2565,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { struct zone *unbalanced_zone; - unsigned long balanced; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; @@ -2605,7 +2598,6 @@ loop_again: int has_under_min_watermark_zone = 0; unbalanced_zone = NULL; - balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2761,8 +2753,6 @@ loop_again: * speculatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } } @@ -2776,7 +2766,7 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) + if (pgdat_balanced(pgdat, order, *classzone_idx)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. 
Take a nap, then take @@ -2800,12 +2790,7 @@ loop_again: } while (--sc.priority >= 0); out: - /* - * order-0: All zones must meet high watermark for a balanced node - * high-order: Balanced zones must make up at least 25% of the node - * for the node to be balanced - */ - if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { + if (!pgdat_balanced(pgdat, order, *classzone_idx)) { cond_resched(); try_to_freeze(); -- cgit From ecccd1248d6e6986130ffcc3b0d003cb46a485c0 Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Fri, 28 Dec 2012 03:16:38 +0100 Subject: mm: fix null pointer dereference in wait_iff_congested() An unintended consequence of commit 4ae0a48b5efc ("mm: modify pgdat_balanced() so that it also handles order-0") is that wait_iff_congested() can now be called with NULL 'struct zone *' producing kernel oops like this: BUG: unable to handle kernel NULL pointer dereference IP: [] wait_iff_congested+0x59/0x140 This trivial patch fixes it. Reported-by: Zhouping Liu Reported-and-tested-by: Sedat Dilek Cc: Andrew Morton Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Zlatko Calusic Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 23291b9ae871..16b42af393ac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2775,7 +2775,7 @@ loop_again: if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); - else + else if (unbalanced_zone) wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); } -- cgit From f2a07f40dbc603c15f8b06e6ec7f768af67b424f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jan 2013 02:01:33 -0800 Subject: tmpfs mempolicy: fix /proc/mounts corrupting memory Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA mempolicy testing. Very nasty. Reading /proc/mounts, /proc/pid/mounts or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often in a page table (causing "Bad swap" or "Bad page map" warning or "Bad pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere worse. "mpol=prefer" and "mpol=prefer:Node" are equally toxic. Recent NUMA enhancements are not to blame: this dates back to 2.6.35, when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(), which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags. With slab poisoning, you can then rely on mpol_to_str() to set the bit for node 0x6b6b, probably in the next page above the caller's stack. mpol_parse_str() is only called from shmem_parse_options(): no_context is always true, so call it unused for now, and remove !no_context code. Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might expect. Then mpol_to_str() can ignore its no_context argument also, the mpol being appropriately initialized whether contextualized or not. Rename its no_context unused too, and let subsequent patch remove them (that's not needed for stable backporting, which would involve rejects). I don't understand why MPOL_LOCAL is described as a pseudo-policy: it's a reasonable policy which suffers from a confusing implementation in terms of MPOL_PREFERRED with MPOL_F_LOCAL. 
I believe this would be much more robust if MPOL_LOCAL were recognized in switch statements throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly empty) nodes mask like everyone else, instead of its preferred_node variant (I presume an optimization from the days before MPOL_LOCAL). But that would take me too long to get right and fully tested. Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 64 ++++++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d1b315e98627..02c914cca53d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2595,8 +2595,7 @@ void numa_default_policy(void) */ /* - * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ static const char * const policy_modes[] = { @@ -2610,28 +2609,21 @@ static const char * const policy_modes[] = #ifdef CONFIG_TMPFS /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @no_context: flag whether to "contextualize" the mempolicy + * @unused: redundant argument, to be removed later. * * Format of input: * [=][:] * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy. This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time. Used to parse tmpfs mpol - * mount option. Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved. Saving - * it again is redundant, but safe. - * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol, int unused) { struct mempolicy *new = NULL; unsigned short mode; - unsigned short uninitialized_var(mode_flags); + unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); @@ -2719,24 +2711,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } else { - int ret; - NODEMASK_SCRATCH(scratch); - if (scratch) { - task_lock(current); - ret = mpol_set_nodemask(new, &nodes, scratch); - task_unlock(current); - } else - ret = -ENOMEM; - NODEMASK_SCRATCH_FREE(scratch); - if (ret) { - mpol_put(new); - goto out; - } - } + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. + */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. 
+ */ + new->w.user_nodemask = nodes; + err = 0; out: @@ -2756,13 +2747,13 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask + * @unused: redundant argument, to be removed later. * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused) { char *p = buffer; int l; @@ -2788,7 +2779,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_PREFERRED: nodes_clear(nodes); if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; /* pseudo-policy */ + mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; @@ -2796,10 +2787,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - if (no_context) - nodes = pol->w.user_nodemask; - else - nodes = pol->v.nodes; + nodes = pol->v.nodes; break; default: -- cgit From a7a88b23737095e6c18a20c5d4eef9e25ec5b829 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jan 2013 02:04:23 -0800 Subject: mempolicy: remove arg from mpol_parse_str, mpol_to_str Remove the unused argument (formerly no_context) from mpol_parse_str() and from mpol_to_str(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 ++---- mm/shmem.c | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c914cca53d..1cb200af3828 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2612,14 +2612,13 @@ static const char * const policy_modes[] = * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @unused: redundant argument, to be removed later. * * Format of input: * [=][:] * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int unused) +int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode; @@ -2747,13 +2746,12 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @unused: redundant argument, to be removed later. * * Convert a mempolicy into a string. 
* Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; diff --git a/mm/shmem.c b/mm/shmem.c index 5c90d84c2b02..5dd56f6efdbd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - mpol_to_str(buffer, sizeof(buffer), mpol, 1); + mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } @@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (!gid_valid(sbinfo->gid)) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + if (mpol_parse_str(value, &sbinfo->mpol)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", -- cgit From 42288fe366c4f1ce7522bc9f27d0bc2a81c55264 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 21 Dec 2012 23:10:25 +0000 Subject: mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. 
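The diff that follows uses a classic "drop the lock, allocate, restart" pattern: sp_alloc() can sleep, so shared_policy_replace() must never allocate under the new spinlock; instead it releases sp->lock, preallocates the node and policy with GFP_KERNEL, and retries the lookup from the top. Below is a minimal userspace C analogue of that pattern, not kernel code; the names (replace_range, need_extra_node) and the 64-byte allocation are invented purely for illustration.

/*
 * Minimal userspace analogue (not kernel code) of the restart pattern
 * in shared_policy_replace() after the mutex -> spinlock conversion:
 * never allocate while holding the spinlock; drop it, allocate with a
 * sleeping allocator, then retry from the top because the protected
 * structure may have changed while the lock was released.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t lock;
static int need_extra_node;     /* set when a range must be split in two */

static int replace_range(void)
{
        void *spare = NULL;

restart:
        pthread_spin_lock(&lock);
        if (need_extra_node && !spare) {
                /*
                 * The kernel may not sleep (so may not use GFP_KERNEL)
                 * under a spinlock: release it, allocate, start over.
                 */
                pthread_spin_unlock(&lock);
                spare = malloc(64);
                if (!spare)
                        return -1;      /* -ENOMEM in the kernel version */
                goto restart;
        }
        /* ... do the actual replacement under the lock here ... */
        pthread_spin_unlock(&lock);
        free(spare);    /* this sketch never consumes the preallocation */
        return 0;
}

int main(void)
{
        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
        need_extra_node = 1;
        printf("replace_range() returned %d\n", replace_range());
        return 0;
}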
Signed-off-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 68 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1cb200af3828..e2df1c1fb41f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->mutex */ +/* Caller holds sp->lock */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - mutex_lock(&sp->mutex); + spin_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); return pol; } @@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) sp_free(n); } +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} + static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { @@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, return NULL; } newpol->flags |= MPOL_F_SHARED; - - n->start = start; - n->end = end; - n->policy = newpol; + sp_node_init(n, start, end, newpol); return n; } @@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; int ret = 0; - mutex_lock(&sp->mutex); +restart: + spin_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } else { /* Old policy spanning whole new range. 
*/ if (n->end > end) { - struct sp_node *new2; - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) { - ret = -ENOMEM; - goto out; - } + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, n->end, end, mpol_new); + sp_insert(sp, n_new); n->end = start; - sp_insert(sp, new2); + n_new = NULL; + mpol_new = NULL; break; } else n->end = start; @@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } if (new) sp_insert(sp, new); -out: - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; } /** @@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - mutex_init(&sp->mutex); + spin_lock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - mutex_lock(&p->mutex); + spin_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - mutex_unlock(&p->mutex); + spin_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING -- cgit From fcb35a9bac6710851bbca1b7908ca85e83aecb4a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:01:06 -0800 Subject: MM: vmscan: remove __devinit attribute. CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings need to be removed. This change removes the use of __devinit from the file. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Cc: Andrew Morton Cc: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 16b42af393ac..196709f5ee58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3122,8 +3122,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { int nid; -- cgit From a458431e176ddb27e8ef8b98c2a681b217337393 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 4 Jan 2013 15:35:08 -0800 Subject: mm: fix zone_watermark_ok_safe() accounting of isolated pages Commit 702d1a6e0766 ("memory-hotplug: fix kswapd looping forever problem") added an isolated pageblocks counter (nr_pageblock_isolate in struct zone) and used it to adjust free pages counter in zone_watermark_ok_safe() to prevent kswapd looping forever problem. Then later, commit 2139cbe627b8 ("cma: fix counting of isolated pages") fixed accounting of isolated pages in global free pages counter. 
It made the previous zone_watermark_ok_safe() fix unnecessary and potentially harmful (cause now isolated pages may be accounted twice making free pages counter incorrect). This patch removes the special isolated pageblocks counter altogether which fixes zone_watermark_ok_safe() free pages check. Reported-by: Tomasz Stanislawski Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Aaditya Kumar Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 --------------------------- mm/page_isolation.c | 26 ++------------------------ 2 files changed, 2 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37127fc..bc6cc0e913bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -/* - * NOTE: - * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. - * Instead, use {un}set_pageblock_isolate. - */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -1655,20 +1650,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -#ifdef CONFIG_MEMORY_ISOLATION -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - if (unlikely(zone->nr_pageblock_isolate)) - return zone->nr_pageblock_isolate * pageblock_nr_pages; - return 0; -} -#else -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - return 0; -} -#endif - bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1684,14 +1665,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - /* - * If the zone has MIGRATE_ISOLATE type free pages, we should consider - * it. nr_zone_isolate_freepages is never accurate so kswapd might not - * sleep although it could do so. But this is more desirable for memory - * hotplug than sleeping which can cause a livelock in the direct - * reclaim path. 
- */ - free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d2264ea4606..383bdbb98b04 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,28 +8,6 @@ #include #include "internal.h" -/* called while holding zone->lock */ -static void set_pageblock_isolate(struct page *page) -{ - if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) - return; - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - page_zone(page)->nr_pageblock_isolate++; -} - -/* called while holding zone->lock */ -static void restore_pageblock_isolate(struct page *page, int migratetype) -{ - struct zone *zone = page_zone(page); - if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) - return; - - BUG_ON(zone->nr_pageblock_isolate <= 0); - set_pageblock_migratetype(page, migratetype); - zone->nr_pageblock_isolate--; -} - int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -80,7 +58,7 @@ out: unsigned long nr_pages; int migratetype = get_pageblock_migratetype(page); - set_pageblock_isolate(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) goto out; nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); - restore_pageblock_isolate(page, migratetype); + set_pageblock_migratetype(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit From 53a59fc67f97374758e63a9c785891ec62324c81 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 4 Jan 2013 15:35:12 -0800 Subject: mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT Since commit e303297e6c3a ("mm: extended batches for generic mmu_gather") we are batching pages to be freed until either tlb_next_batch cannot allocate a new batch or we are done. This works just fine most of the time but we can get in troubles with non-preemptible kernel (CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY) on large machines where too aggressive batching might lead to soft lockups during process exit path (exit_mmap) because there are no scheduling points down the free_pages_and_swap_cache path and so the freeing can take long enough to trigger the soft lockup. The lockup is harmless except when the system is setup to panic on softlockup which is not that unusual. The simplest way to work around this issue is to limit the maximum number of batches in a single mmu_gather. 10k of collected pages should be safe to prevent from soft lockups (we would have 2ms for one) even if they are all freed without an explicit scheduling point. This patch doesn't add any new explicit scheduling points because it relies on zap_pmd_range during page tables zapping which calls cond_resched per PMD. The following lockup has been reported for 3.0 kernel with a huge process (in order of hundreds gigs but I do know any more details). BUG: soft lockup - CPU#56 stuck for 22s! 
[kernel:31053] Modules linked in: af_packet nfs lockd fscache auth_rpcgss nfs_acl sunrpc mptctl mptbase autofs4 binfmt_misc dm_round_robin dm_multipath bonding cpufreq_conservative cpufreq_userspace cpufreq_powersave pcc_cpufreq mperf microcode fuse loop osst sg sd_mod crc_t10dif st qla2xxx scsi_transport_fc scsi_tgt netxen_nic i7core_edac iTCO_wdt joydev e1000e serio_raw pcspkr edac_core iTCO_vendor_support acpi_power_meter rtc_cmos hpwdt hpilo button container usbhid hid dm_mirror dm_region_hash dm_log linear uhci_hcd ehci_hcd usbcore usb_common scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh dm_snapshot pcnet32 mii edd dm_mod raid1 ext3 mbcache jbd fan thermal processor thermal_sys hwmon cciss scsi_mod Supported: Yes CPU 56 Pid: 31053, comm: kernel Not tainted 3.0.31-0.9-default #1 HP ProLiant DL580 G7 RIP: 0010: _raw_spin_unlock_irqrestore+0x8/0x10 RSP: 0018:ffff883ec1037af0 EFLAGS: 00000206 RAX: 0000000000000e00 RBX: ffffea01a0817e28 RCX: ffff88803ffd9e80 RDX: 0000000000000200 RSI: 0000000000000206 RDI: 0000000000000206 RBP: 0000000000000002 R08: 0000000000000001 R09: ffff887ec724a400 R10: 0000000000000000 R11: dead000000200200 R12: ffffffff8144c26e R13: 0000000000000030 R14: 0000000000000297 R15: 000000000000000e FS: 00007ed834282700(0000) GS:ffff88c03f200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 000000000068b240 CR3: 0000003ec13c5000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kernel (pid: 31053, threadinfo ffff883ec1036000, task ffff883ebd5d4100) Call Trace: release_pages+0xc5/0x260 free_pages_and_swap_cache+0x9d/0xc0 tlb_flush_mmu+0x5c/0x80 tlb_finish_mmu+0xe/0x50 exit_mmap+0xbd/0x120 mmput+0x49/0x120 exit_mm+0x122/0x160 do_exit+0x17a/0x430 do_group_exit+0x3d/0xb0 get_signal_to_deliver+0x247/0x480 do_signal+0x71/0x1b0 do_notify_resume+0x98/0xb0 int_signal+0x12/0x17 DWARF2 unwinder stuck at int_signal+0x12/0x17 Signed-off-by: Michal Hocko Cc: [3.0+] Cc: Mel Gorman Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e0a9b0ce4f10..49fb1cf08611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; -- cgit From e53289c0c5e5a24e29e571eba7af05c845c10890 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Jan 2013 08:36:54 -0800 Subject: mm: reinstante dropped pmd_trans_splitting() check The check for a pmd being in the process of being split was dropped by mistake by commit d10e63f29488 ("mm: numa: Create basic numa page hinting infrastructure"). Put it back. 
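The hunk this commit adds (shown just below) refuses to touch a huge pmd while another CPU is in the middle of splitting it, and simply returns so the fault is retried once the split completes. Here is a tiny userspace C sketch of that rule; it is not kernel code, and every identifier in it (fake_pmd, handle_huge_fault, the flag fields) is invented for illustration.

/*
 * Userspace sketch of the restored rule: a huge pmd that is being
 * split is in a transient state, so the fault handler backs off and
 * lets the fault be retried.  All names here are invented.
 */
#include <stdio.h>

struct fake_pmd {
        int trans_huge;         /* mapped by a huge pmd */
        int trans_splitting;    /* split in progress on another CPU */
        int numa;               /* marked for NUMA hinting */
};

static int handle_huge_fault(struct fake_pmd *pmd)
{
        if (!pmd->trans_huge)
                return -1;      /* handled elsewhere */

        /* Transient state: do nothing, the fault will be retried. */
        if (pmd->trans_splitting)
                return 0;

        if (pmd->numa)
                printf("handle the NUMA hinting fault on the huge pmd\n");
        else
                printf("handle write/dirty on the huge pmd\n");
        return 0;
}

int main(void)
{
        struct fake_pmd pmd = { .trans_huge = 1, .trans_splitting = 1 };

        handle_huge_fault(&pmd);        /* backs off, caller refaults */
        pmd.trans_splitting = 0;
        pmd.numa = 1;
        handle_huge_fault(&pmd);        /* split finished, safe to handle */
        return 0;
}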
Reported-by: Dave Jones Debugged-by: Hillf Danton Acked-by: Andrea Arcangeli Acked-by: Mel Gorman Cc: Kirill Shutemov Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 49fb1cf08611..bb1369f7b9b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3711,6 +3711,14 @@ retry: if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; + /* + * If the pmd is splitting, return and retry the + * the fault. Alternative: wait until the split + * is done, and goto retry. + */ + if (pmd_trans_splitting(orig_pmd)) + return 0; + if (pmd_numa(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); -- cgit From 04fa5d6a6547fbfcf613efd00637666fe19b24ab Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 14:31:40 -0800 Subject: mm: migrate: check page_count of THP before migrating Hugh Dickins pointed out that migrate_misplaced_transhuge_page() does not check page_count before migrating like base page migration and khugepage. He could not see why this was safe and he is right. The potential impact of the bug is avoided due to the limitations of NUMA balancing. The page_mapcount() check ensures that only a single address space is using this page and as THPs are typically private it should not be possible for another address space to fault it in parallel. If the address space has one associated task then it's difficult to have both a GUP pin and be referencing the page at the same time. If there are multiple tasks then a buggy scenario requires that another thread be accessing the page while the direct IO is in flight. This is dodgy behaviour as there is a possibility of corruption with or without THP migration. It would be While we happen to be safe for the most part it is shoddy to depend on such "safety" so this patch checks the page count similar to anonymous pages. Note that this does not mean that the page_mapcount() check can go away. If we were to remove the page_mapcount() check the the THP would have to be unmapped from all referencing PTEs, replaced with migration PTEs and restored properly afterwards. Signed-off-by: Mel Gorman Reported-by: Hugh Dickins Cc: Ingo Molnar Cc: Andrea Arcangeli Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3b676b0c5c3e..c38778610aa8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1679,9 +1679,21 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_xchg_last_nid(new_page, page_last_nid(page)); isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) { + + /* + * Failing to isolate or a GUP pin prevents migration. The expected + * page count is 2. 1 for anonymous pages without a mapping and 1 + * for the callers pin. If the page was isolated, the page will + * need to be put back on the LRU. 
+ */ + if (!isolated || page_count(page) != 2) { count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); put_page(new_page); + if (isolated) { + putback_lru_page(page); + isolated = 0; + goto out; + } goto out_keep_locked; } -- cgit From c0232ae861df679092c15960b6cd9f589d9b7177 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Fri, 11 Jan 2013 14:31:44 -0800 Subject: mm: memblock: fix wrong memmove size in memblock_merge_regions() The memmove span covers from (next+1) to the end of the array, and the index of next is (i+1), so the index of (next+1) is (i+2). So the size of remaining array elements is (type->cnt - (i + 2)). Since the remaining elements of the memblock array are move forward by one element and there is only one additional element caused by this bug. So there won't be any write overflow here but read overflow. It may read one more element out of the array address if the array happens to be full. Commonly it doesn't matter at all but if the array happens to be located at the end a memblock, it may cause a invalid read operation for the physical address doesn't exist. There are 2 *happens to be* here, so I think the probability is quite low, I don't know if any guy is haunted by this bug before. Mostly I think it's user-invisible. Signed-off-by: Lin Feng Acked-by: Tejun Heo Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 625905523c2a..88adc8afb610 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) } this->size += next->size; - memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); type->cnt--; } } -- cgit From 7964c06d66c76507d8b6b662bffea770c29ef0ce Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Fri, 11 Jan 2013 14:31:47 -0800 Subject: mm: compaction: fix echo 1 > compact_memory return error issue when run the folloing command under shell, it will return error sh/$ echo 1 > /proc/sys/vm/compact_memory sh/$ sh: write error: Bad address After strace, I found the following log: ... write(1, "1\n", 2) = 3 write(1, "", 4294967295) = -1 EFAULT (Bad address) write(2, "echo: write error: Bad address\n", 31echo: write error: Bad address ) = 31 This tells system return 3(COMPACT_COMPLETE) after write data to compact_memory. The fix is to make the system just return 0 instead 3(COMPACT_COMPLETE) from sysctl_compaction_handler after compaction_nodes finished. 
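The write(2) contract is what broke here: the proc handler's return value (COMPACT_COMPLETE == 3) leaked out as the number of bytes "written", which is larger than the 2 bytes echo asked to write, so echo's remaining-byte arithmetic underflowed and the follow-up write faulted. The snippet below is a self-contained userspace illustration of that arithmetic, not the kernel path itself; buggy_write() is an invented stand-in for the old handler's behaviour.

/*
 * Userspace illustration of the failure mode: a "write" that claims
 * to have written more bytes than were requested makes the writer's
 * remaining-byte arithmetic wrap around.  buggy_write() is invented;
 * it stands in for the old sysctl_compaction_handler() behaviour.
 */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static ssize_t buggy_write(const char *buf, size_t len)
{
        (void)buf;
        (void)len;
        return 3;       /* COMPACT_COMPLETE, not the byte count (2) */
}

int main(void)
{
        const char *msg = "1\n";
        size_t remaining = strlen(msg);         /* 2 */
        ssize_t ret = buggy_write(msg, remaining);

        remaining -= ret;       /* 2 - 3 wraps to a huge unsigned value */
        printf("next write would ask for %zu bytes, hence EFAULT\n",
               remaining);
        return 0;
}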
Signed-off-by: Jason Liu Suggested-by: David Rientjes Acked-by: Mel Gorman Cc: Rik van Riel Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e466497..f8f5c111b7d7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1210,7 +1210,7 @@ static int compact_node(int nid) } /* Compact all nodes in the system */ -static int compact_nodes(void) +static void compact_nodes(void) { int nid; @@ -1219,8 +1219,6 @@ static int compact_nodes(void) for_each_online_node(nid) compact_node(nid); - - return COMPACT_COMPLETE; } /* The written value is actually unused, all memory is compacted */ @@ -1231,7 +1229,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { if (write) - return compact_nodes(); + compact_nodes(); return 0; } -- cgit From c060f943d0929f3e429c5d9522290584f6281d6e Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 11 Jan 2013 14:31:51 -0800 Subject: mm: use aligned zone start for pfn_to_bitidx calculation The current calculation in pfn_to_bitidx assumes that (pfn - zone->zone_start_pfn) >> pageblock_order will return the same bit for all pfn in a pageblock. If zone_start_pfn is not aligned to pageblock_nr_pages, this may not always be correct. Consider the following with pageblock order = 10, zone start 2MB: pfn | pfn - zone start | (pfn - zone start) >> page block order ---------------------------------------------------------------- 0x26000 | 0x25e00 | 0x97 0x26100 | 0x25f00 | 0x97 0x26200 | 0x26000 | 0x98 0x26300 | 0x26100 | 0x98 This means that calling {get,set}_pageblock_migratetype on a single page will not set the migratetype for the full block. Fix this by rounding down zone_start_pfn when doing the bitidx calculation. For our use case, the effects of this bug were mostly tied to the fact that CMA allocations would either take a long time or fail to happen. Depending on the driver using CMA, this could result in anything from visual glitches to application failures. Signed-off-by: Laura Abbott Acked-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc6cc0e913bd..c957805a7f0e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5604,7 +5604,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } -- cgit From 10d73e655cef6e86ea8589dca3df4e495e4900b0 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 11 Jan 2013 14:31:52 -0800 Subject: mm: bootmem: fix free_all_bootmem_core() with odd bitmap alignment Currently free_all_bootmem_core ignores that node_min_pfn may be not multiple of BITS_PER_LONG. Eg commit 6dccdcbe2c3e ("mm: bootmem: fix checking the bitmap when finally freeing bootmem") shifts vec by lower bits of start instead of lower bits of idx. 
Also if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) assumes that vec bit 0 corresponds to start pfn, which is only true when node_min_pfn is a multiple of BITS_PER_LONG. Also loop in the else clause can double-free pages (e.g. with node_min_pfn == start == 1, map[0] == ~0 on 32-bit machine page 32 will be double-freed). This bug causes the following message during xtensa kernel boot: bootmem::free_all_bootmem_core nid=0 start=1 end=8000 BUG: Bad page state in process swapper pfn:00001 page:d04bd020 count:0 mapcount:-127 mapping: (null) index:0x2 page flags: 0x0() Call Trace: bad_page+0x8c/0x9c free_pages_prepare+0x5e/0x88 free_hot_cold_page+0xc/0xa0 __free_pages+0x24/0x38 __free_pages_bootmem+0x54/0x56 free_all_bootmem_core$part$11+0xeb/0x138 free_all_bootmem+0x46/0x58 mem_init+0x25/0xa4 start_kernel+0x11e/0x25c should_never_return+0x0/0x3be7 The fix is the following: - always align vec so that its bit 0 corresponds to start - provide BITS_PER_LONG bits in vec, if those bits are available in the map - don't free pages past next start position in the else clause. Signed-off-by: Max Filippov Cc: Gavin Shan Cc: Johannes Weiner Cc: Tejun Heo Cc: Yinghai Lu Cc: Joonsoo Kim Cc: Prasad Koya Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 1324cd74faec..b93376c39b61 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) while (start < end) { unsigned long *map, idx, vec; + unsigned shift; map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; + shift = idx & (BITS_PER_LONG - 1); + /* + * vec holds at most BITS_PER_LONG map bits, + * bit 0 corresponds to start. + */ vec = ~map[idx / BITS_PER_LONG]; + + if (shift) { + vec >>= shift; + if (end - start >= BITS_PER_LONG) + vec |= ~map[idx / BITS_PER_LONG + 1] << + (BITS_PER_LONG - shift); + } /* * If we have a properly aligned and fully unreserved * BITS_PER_LONG block of pages in front of us, free @@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long off = 0; + unsigned long cur = start; - vec >>= start & (BITS_PER_LONG - 1); - while (vec) { + start = ALIGN(start + 1, BITS_PER_LONG); + while (vec && cur != start) { if (vec & 1) { - page = pfn_to_page(start + off); + page = pfn_to_page(cur); __free_pages_bootmem(page, 0); count++; } vec >>= 1; - off++; + ++cur; } - start = ALIGN(start + 1, BITS_PER_LONG); } } -- cgit From 572043c90db65b45a4efd959db7458edcf6411ad Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 11 Jan 2013 14:31:59 -0800 Subject: mm: mmap: annotate vm_lock_anon_vma locking properly for lockdep Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") turned anon_vma mutex to rwsem. However, the properly annotated nested locking in mm_take_all_locks() has been converted from mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); to down_write(&anon_vma->root->rwsem); which is incomplete, and causes the false positive report from lockdep below. Annotate the fact that mmap_sem is used as an outter lock to serialize taking of all the anon_vma rwsems at once no matter the order, using the down_write_nest_lock() primitive. 
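down_write_nest_lock() does not change the locking, it only documents it: mm_take_all_locks() is the one place that grabs every anon_vma rwsem and i_mmap_mutex in the address space, and it only does so while the caller holds mmap_sem for write (with mm_all_locks_mutex serialising concurrent callers), which is what makes the arbitrary inner ordering safe. A userspace analogue of that shape is sketched below; it is illustrative only, and the names (outer, inner, take_all_locks) are invented.

/*
 * Userspace analogue of the mm_take_all_locks() shape: one outer lock
 * serialises the only path that ever takes all of the per-object locks
 * at once, so the inner locks can be taken in any order without risk
 * of deadlock.  down_write_nest_lock() is merely the lockdep
 * annotation that states this.  All names below are invented.
 */
#include <pthread.h>
#include <stdio.h>

#define NOBJ 4

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* "mmap_sem" */
static pthread_rwlock_t inner[NOBJ];                       /* "anon_vma rwsems" */

static void take_all_locks(void)
{
        int i;

        pthread_mutex_lock(&outer);
        for (i = 0; i < NOBJ; i++)
                pthread_rwlock_wrlock(&inner[i]);

        puts("all inner locks held under the outer lock");

        for (i = NOBJ - 1; i >= 0; i--)
                pthread_rwlock_unlock(&inner[i]);
        pthread_mutex_unlock(&outer);
}

int main(void)
{
        int i;

        for (i = 0; i < NOBJ; i++)
                pthread_rwlock_init(&inner[i], NULL);
        take_all_locks();
        return 0;
}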
This patch fixes this lockdep report: ============================================= [ INFO: possible recursive locking detected ] 3.8.0-rc2-00036-g5f73896 #171 Not tainted --------------------------------------------- qemu-kvm/2315 is trying to acquire lock: (&anon_vma->rwsem){+.+...}, at: mm_take_all_locks+0x149/0x1b0 but task is already holding lock: (&anon_vma->rwsem){+.+...}, at: mm_take_all_locks+0x149/0x1b0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&anon_vma->rwsem); lock(&anon_vma->rwsem); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by qemu-kvm/2315: #0: (&mm->mmap_sem){++++++}, at: do_mmu_notifier_register+0xfc/0x170 #1: (mm_all_locks_mutex){+.+...}, at: mm_take_all_locks+0x36/0x1b0 #2: (&mapping->i_mmap_mutex){+.+...}, at: mm_take_all_locks+0xc9/0x1b0 #3: (&anon_vma->rwsem){+.+...}, at: mm_take_all_locks+0x149/0x1b0 stack backtrace: Pid: 2315, comm: qemu-kvm Not tainted 3.8.0-rc2-00036-g5f73896 #171 Call Trace: print_deadlock_bug+0xf2/0x100 validate_chain+0x4f6/0x720 __lock_acquire+0x359/0x580 lock_acquire+0x121/0x190 down_write+0x3f/0x70 mm_take_all_locks+0x149/0x1b0 do_mmu_notifier_register+0x68/0x170 mmu_notifier_register+0xe/0x10 kvm_create_vm+0x22b/0x330 [kvm] kvm_dev_ioctl+0xf8/0x1a0 [kvm] do_vfs_ioctl+0x9d/0x350 sys_ioctl+0x91/0xb0 system_call_fastpath+0x16/0x1b Signed-off-by: Jiri Kosina Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index f54b235f29a9..35730ee9d515 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2886,7 +2886,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - down_write(&anon_vma->root->rwsem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares -- cgit From 062f1af2170afe817133d358d900a5f33e3856e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 14:32:02 -0800 Subject: mm: thp: acquire the anon_vma rwsem for write during split Zhouping Liu reported the following against 3.8-rc1 when running a mmap testcase from LTP. mapcount 0 page_mapcount 3 ------------[ cut here ]------------ kernel BUG at mm/huge_memory.c:1798! 
invalid opcode: 0000 [#1] SMP Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables bnep bluetooth rfkill iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter ip_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi ib_iser rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfat fat dm_mirror dm_region_hash dm_log dm_mod cdc_ether iTCO_wdt i7core_edac coretemp usbnet iTCO_vendor_support mii crc32c_intel edac_core lpc_ich shpchp ioatdma mfd_core i2c_i801 pcspkr serio_raw bnx2 microcode dca vhost_net tun macvtap macvlan kvm_intel kvm uinput mgag200 sr_mod cdrom i2c_algo_bit sd_mod drm_kms_helper crc_t10dif ata_generic pata_acpi ttm ata_piix drm libata i2c_core megaraid_sas CPU 1 Pid: 23217, comm: mmap10 Not tainted 3.8.0-rc1mainline+ #17 IBM IBM System x3400 M3 Server -[7379I08]-/69Y4356 RIP: __split_huge_page+0x677/0x6d0 RSP: 0000:ffff88017a03fc08 EFLAGS: 00010293 RAX: 0000000000000003 RBX: ffff88027a6c22e0 RCX: 00000000000034d2 RDX: 000000000000748b RSI: 0000000000000046 RDI: 0000000000000246 RBP: ffff88017a03fcb8 R08: ffffffff819d2440 R09: 000000000000054a R10: 0000000000aaaaaa R11: 00000000ffffffff R12: 0000000000000000 R13: 00007f4f11a00000 R14: ffff880179e96e00 R15: ffffea0005c08000 FS: 00007f4f11f4a740(0000) GS:ffff88017bc20000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000037e9ebb404 CR3: 000000017a436000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process mmap10 (pid: 23217, threadinfo ffff88017a03e000, task ffff880172dd32e0) Stack: ffff88017a540ec8 ffff88017a03fc20 ffffffff816017b5 ffff88017a03fc88 ffffffff812fa014 0000000000000000 ffff880279ebd5c0 00000000f4f11a4c 00000007f4f11f49 00000007f4f11a00 ffff88017a540ef0 ffff88017a540ee8 Call Trace: split_huge_page+0x68/0xb0 __split_huge_page_pmd+0x134/0x330 split_huge_page_pmd_mm+0x51/0x60 split_huge_page_address+0x3b/0x50 __vma_adjust_trans_huge+0x9c/0xf0 vma_adjust+0x684/0x750 __split_vma.isra.28+0x1fa/0x220 do_munmap+0xf9/0x420 vm_munmap+0x4e/0x70 sys_munmap+0x2b/0x40 system_call_fastpath+0x16/0x1b Alexander Beregalov and Alex Xu reported similar bugs and Hillf Danton identified that commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") and commit 4fc3f1d66b1e ("mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable") were likely the problem. Reverting these commits was reported to solve the problem for Alexander. Despite the reason for these commits, NUMA balancing is not the direct source of the problem. split_huge_page() expects the anon_vma lock to be exclusive to serialise the whole split operation. Ordinarily it is expected that the anon_vma lock would only be required when updating the avcs but THP also uses the anon_vma rwsem for collapse and split operations where the page lock or compound lock cannot be used (as the page is changing from base to THP or vice versa) and the page table locks are insufficient. This patch takes the anon_vma lock for write to serialise against parallel split_huge_page as THP expected before the conversion to rwsem. 
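The patch below pins the anon_vma first and then takes its rwsem for write, because the split changes the structure the readers are walking. A userspace sketch of that shape follows; it is not kernel code, and the struct and function names (fake_anon_vma, rmap_walk, split_huge_page) are invented stand-ins.

/*
 * Userspace sketch of the rule restored for split_huge_page(): rmap
 * walkers take the anon_vma lock for read, but splitting changes the
 * structure itself, so it pins the anon_vma and then takes the lock
 * for write.  All names below are invented.
 */
#include <pthread.h>
#include <stdio.h>

struct fake_anon_vma {
        int refcount;
        pthread_rwlock_t rwsem;
};

static struct fake_anon_vma av;

static void rmap_walk(void)
{
        pthread_rwlock_rdlock(&av.rwsem);       /* readers may run concurrently */
        /* ... walk the anon_vma chain ... */
        pthread_rwlock_unlock(&av.rwsem);
}

static void split_huge_page(void)
{
        __sync_fetch_and_add(&av.refcount, 1);  /* pin, like page_get_anon_vma() */
        pthread_rwlock_wrlock(&av.rwsem);       /* exclusive for the whole split */
        puts("splitting under the write lock");
        pthread_rwlock_unlock(&av.rwsem);
        __sync_fetch_and_sub(&av.refcount, 1);  /* like put_anon_vma() */
}

int main(void)
{
        av.refcount = 1;
        pthread_rwlock_init(&av.rwsem, NULL);
        rmap_walk();
        split_huge_page();
        return 0;
}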
Reported-and-tested-by: Zhouping Liu Reported-by: Alexander Beregalov Reported-by: Alex Xu Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9e894edc7811..6001ee6347a9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1819,9 +1819,19 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma_read(page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(page); if (!anon_vma) goto out; + anon_vma_lock_write(anon_vma); + ret = 0; if (!PageCompound(page)) goto out_unlock; @@ -1832,7 +1842,8 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma_read(anon_vma); + anon_vma_unlock(anon_vma); + put_anon_vma(anon_vma); out: return ret; } -- cgit From 8fb74b9fb2b182d54beee592350d9ea1f325917a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 14:32:16 -0800 Subject: mm: compaction: partially revert capture of suitable high-order page Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting for POLLIN on a local TCP socket. It was easier to trigger if there was disk IO and dirty pages at the same time and he bisected it to commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). The intention of that patch was to improve high-order allocations under memory pressure after changes made to reclaim in 3.6 drastically hurt THP allocations but the approach was flawed. For Eric, the problem was that page->pfmemalloc was not being cleared for captured pages leading to a poor interaction with swap-over-NFS support causing the packets to be dropped. However, I identified a few more problems with the patch including the fact that it can increase contention on zone->lock in some cases which could result in async direct compaction being aborted early. In retrospect the capture patch took the wrong approach. What it should have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it was allocating for THP and avoided races that way. While the patch was showing to improve allocation success rates at the time, the benefit is marginal given the relative complexity and it should be revisited from scratch in the context of the other reclaim-related changes that have taken place since the patch was first written and tested. This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). 
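What remains after the revert is the plain check in compact_finished(): walk the free areas from the requested order upwards and declare compaction partially complete as soon as a suitable block exists. The following is a userspace C sketch of that loop under invented names and sizes (free_area, suitable_page_free, MAX_ORDER 11, pageblock order 9); it is an illustration, not the kernel implementation.

/*
 * Userspace sketch of the check left in compact_finished(): succeed as
 * soon as a free block of the requested order sits on the right
 * migratetype list, or any free block is large enough to retype a
 * whole pageblock.  Sizes and names are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER       11
#define PAGEBLOCK_ORDER 9
#define NR_MIGRATETYPES 3

struct free_area {
        int nr_free;                            /* all free blocks of this order */
        int nr_per_type[NR_MIGRATETYPES];       /* per-migratetype counts */
};

static bool suitable_page_free(struct free_area area[MAX_ORDER],
                               int order, int migratetype)
{
        int o;

        for (o = order; o < MAX_ORDER; o++) {
                if (area[o].nr_per_type[migratetype] > 0)
                        return true;    /* right migratetype already free */
                if (order >= PAGEBLOCK_ORDER && area[o].nr_free > 0)
                        return true;    /* big enough to retype the block */
        }
        return false;
}

int main(void)
{
        struct free_area area[MAX_ORDER] = { { 0 } };

        area[10].nr_free = 1;
        area[10].nr_per_type[1] = 1;
        printf("order-9 request for type 0 satisfiable: %d\n",
               suitable_page_free(area, 9, 0));
        return 0;
}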
Reported-and-tested-by: Eric Wong Tested-by: Eric Dumazet Cc: Signed-off-by: Mel Gorman Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 92 ++++++++------------------------------------------------- mm/internal.h | 1 - mm/page_alloc.c | 35 +++++----------------- 3 files changed, 21 insertions(+), 107 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f8f5c111b7d7..c62bd063d766 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? */ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. 
- */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1007,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, - .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -1203,7 +1138,6 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..9ba21100ebf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -135,7 +135,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c957805a7f0e..df2022ff0c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1384,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int 
migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1399,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); if (mt != MIGRATE_ISOLATE) { @@ -1408,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1416,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1431,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1449,10 +1438,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -2136,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - if (!order) return NULL; @@ -2149,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2168,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; -- cgit