Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       13
-rw-r--r--  mm/filemap.c          18
-rw-r--r--  mm/ksm.c               5
-rw-r--r--  mm/list_lru.c         12
-rw-r--r--  mm/madvise.c          12
-rw-r--r--  mm/memcontrol.c       23
-rw-r--r--  mm/memory.c            2
-rw-r--r--  mm/memory_hotplug.c    7
-rw-r--r--  mm/oom_kill.c         16
-rw-r--r--  mm/page_alloc.c        3
-rw-r--r--  mm/percpu-stats.c      2
-rw-r--r--  mm/percpu.c            4
-rw-r--r--  mm/rodata_test.c       2
-rw-r--r--  mm/slab_common.c      22
-rw-r--r--  mm/swap.c              4
-rw-r--r--  mm/swap_state.c       11
-rw-r--r--  mm/z3fold.c           10
17 files changed, 120 insertions(+), 46 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index fb548e4c7bd4..03d31a875341 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1999,17 +1999,14 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
- /*
- * Pairs with implicit barrier in wait_event_freezable()
- * such that wakeups are not missed in the lockless
- * waitqueue_active() call.
- */
- smp_acquire__after_ctrl_dep();
-
if (pgdat->kcompactd_classzone_idx > classzone_idx)
pgdat->kcompactd_classzone_idx = classzone_idx;
- if (!waitqueue_active(&pgdat->kcompactd_wait))
+ /*
+ * Pairs with implicit barrier in wait_event_freezable()
+ * such that wakeups are not missed.
+ */
+ if (!wq_has_sleeper(&pgdat->kcompactd_wait))
return;
if (!kcompactd_node_suitable(pgdat))
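
The compaction hunk above relies on the memory barrier built into wq_has_sleeper(). A minimal sketch of the waker side of this lockless pattern, using made-up names (demo_wait, demo_work_pending, demo_wake), not the kcompactd code itself:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static bool demo_work_pending;

static void demo_wake(void)
{
    WRITE_ONCE(demo_work_pending, true);

    /*
     * wq_has_sleeper() issues the full barrier that pairs with the
     * implicit barrier in wait_event_freezable(), so a waiter that has
     * already checked demo_work_pending cannot miss this wakeup.
     */
    if (wq_has_sleeper(&demo_wait))
        wake_up_interruptible(&demo_wait);
}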
diff --git a/mm/filemap.c b/mm/filemap.c
index 870971e20967..594d73fef8b4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -620,6 +620,14 @@ int file_check_and_advance_wb_err(struct file *file)
trace_file_check_and_advance_wb_err(file, old);
spin_unlock(&file->f_lock);
}
+
+ /*
+ * We're mostly using this function as a drop-in replacement for
+ * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+ * that the legacy code would have had on these flags.
+ */
+ clear_bit(AS_EIO, &mapping->flags);
+ clear_bit(AS_ENOSPC, &mapping->flags);
return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
@@ -2926,9 +2934,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
+ *
+ * Most of the time we do not need this since dio_complete() will do
+ * the invalidation for us. However there are some file systems that
+ * do not end up with dio_complete() being called, so let's not break
+ * them by removing it completely
*/
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
+ if (mapping->nrpages)
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
if (written > 0) {
pos += written;
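
The filemap.c changes tie into the errseq_t-based writeback error reporting that file_check_and_advance_wb_err() implements. A hedged sketch of how a filesystem ->fsync() typically consumes that machinery; demo_fsync() and the elided metadata commit are illustrative, not code from this patch:

#include <linux/fs.h>

static int demo_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
    int err;

    /*
     * Writes back the range and then samples the mapping's errseq_t via
     * file_check_and_advance_wb_err(), so a writeback error is reported
     * once per file description rather than being lost or repeated.
     */
    err = file_write_and_wait_range(file, start, end);
    if (err)
        return err;

    /* ... commit any metadata for file_inode(file) here ... */
    return 0;
}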
diff --git a/mm/ksm.c b/mm/ksm.c
index 15dd7415f7b3..6cb60f46cce5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1990,6 +1990,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
*/
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
+ struct mm_struct *mm = rmap_item->mm;
struct rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct stable_node *stable_node;
@@ -2062,9 +2063,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
- vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
+ down_read(&mm->mmap_sem);
+ vma = find_mergeable_vma(mm, rmap_item->address);
err = try_to_merge_one_page(vma, page,
ZERO_PAGE(rmap_item->address));
+ up_read(&mm->mmap_sem);
/*
* In case of failure, the page was not really empty, so we
* need to continue. Otherwise we're done.
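
The ksm.c hunk takes mmap_sem for read around the VMA lookup, since a vm_area_struct is only stable while that lock is held. A small sketch of the general rule; demo_vma_is_anon() is a made-up helper:

#include <linux/mm.h>

static bool demo_vma_is_anon(struct mm_struct *mm, unsigned long addr)
{
    struct vm_area_struct *vma;
    bool ret = false;

    down_read(&mm->mmap_sem);
    vma = find_vma(mm, addr);
    if (vma && vma->vm_start <= addr)
        ret = vma_is_anonymous(vma);    /* only safe under the lock */
    up_read(&mm->mmap_sem);

    return ret;
}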
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7a40fa2be858..f141f0c80ff3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -325,12 +325,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
{
int size = memcg_nr_cache_ids;
- nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+ nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL);
if (!nlru->memcg_lrus)
return -ENOMEM;
if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
- kfree(nlru->memcg_lrus);
+ kvfree(nlru->memcg_lrus);
return -ENOMEM;
}
@@ -340,7 +340,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
{
__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
- kfree(nlru->memcg_lrus);
+ kvfree(nlru->memcg_lrus);
}
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -351,12 +351,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
BUG_ON(old_size > new_size);
old = nlru->memcg_lrus;
- new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+ new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;
if (__memcg_init_list_lru_node(new, old_size, new_size)) {
- kfree(new);
+ kvfree(new);
return -ENOMEM;
}
@@ -373,7 +373,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
nlru->memcg_lrus = new;
spin_unlock_irq(&nlru->lock);
- kfree(old);
+ kvfree(old);
return 0;
}
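
The list_lru.c conversion swaps kmalloc()/kfree() for kvmalloc()/kvfree() because memcg_nr_cache_ids can grow large. A minimal sketch of that allocation pattern; demo_alloc_ptr_array() and demo_free_ptr_array() are made-up names:

#include <linux/mm.h>
#include <linux/slab.h>

static void **demo_alloc_ptr_array(size_t nr)
{
    /*
     * kvmalloc_array() tries kmalloc() first and transparently falls
     * back to vmalloc() when the array is too large for the slab
     * allocator to satisfy reliably.
     */
    return kvmalloc_array(nr, sizeof(void *), GFP_KERNEL | __GFP_ZERO);
}

static void demo_free_ptr_array(void **arr)
{
    kvfree(arr);    /* handles both kmalloc- and vmalloc-backed memory */
}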
diff --git a/mm/madvise.c b/mm/madvise.c
index 21261ff0466f..25bade36e9ca 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior,
{
struct page *page;
struct zone *zone;
+ unsigned int order;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- for (; start < end; start += PAGE_SIZE <<
- compound_order(compound_head(page))) {
+
+ for (; start < end; start += PAGE_SIZE << order) {
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
if (ret != 1)
return ret;
+ /*
+ * When soft offlining hugepages, after migrating the page
+ * we dissolve it, therefore in the second loop "page" will
+ * no longer be a compound page, and order will be 0.
+ */
+ order = compound_order(compound_head(page));
+
if (PageHWPoison(page)) {
put_page(page);
continue;
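
The madvise_inject_error() fix reads the compound order while the reference from get_user_pages_fast() is still held, then steps the loop by that order. A sketch of the loop shape with the per-page work elided; demo_walk_range() is illustrative only:

#include <linux/mm.h>

static int demo_walk_range(unsigned long start, unsigned long end)
{
    struct page *page;
    unsigned int order;

    for (; start < end; start += PAGE_SIZE << order) {
        if (get_user_pages_fast(start, 1, 0, &page) != 1)
            return -EFAULT;

        /* capture the order before anything can dissolve the page */
        order = compound_order(compound_head(page));

        /* ... act on the page here ... */
        put_page(page);
    }
    return 0;
}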
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 15af3da5af02..d5f3a62887cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1777,6 +1777,10 @@ static void drain_local_stock(struct work_struct *dummy)
struct memcg_stock_pcp *stock;
unsigned long flags;
+ /*
+ * The only protection from memory hotplug vs. drain_stock races is
+ * that we always operate on local CPU stock here with IRQ disabled
+ */
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
@@ -1821,27 +1825,33 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
/* If someone's already draining, avoid running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
return;
- /* Notify other cpus that system-wide "drain" is running */
- get_online_cpus();
+ /*
+ * Notify other cpus that system-wide "drain" is running
+ * We do not care about races with the cpu hotplug because cpu down
+ * as well as workers from this path always operate on the local
+ * per-cpu data. CPU up doesn't touch memcg_stock at all.
+ */
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
memcg = stock->cached;
- if (!memcg || !stock->nr_pages)
+ if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg))
+ if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
+ css_put(&memcg->css);
continue;
+ }
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
+ css_put(&memcg->css);
}
put_cpu();
- put_online_cpus();
mutex_unlock(&percpu_charge_mutex);
}
@@ -5648,7 +5658,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+ VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
+ !PageHWPoison(page), page);
if (!page->mem_cgroup)
return;
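
The drain_all_stock() change pins each cached memcg with css_tryget() before scheduling work on it, since the cpu-hotplug lock no longer keeps the cgroup alive. A hedged sketch of that refcount pattern; demo_use_memcg() is a made-up helper:

#include <linux/memcontrol.h>

static void demo_use_memcg(struct mem_cgroup *memcg)
{
    /* css_tryget() fails if the cgroup is already being destroyed */
    if (!memcg || !css_tryget(&memcg->css))
        return;

    /* ... memcg may be dereferenced safely here ... */

    css_put(&memcg->css);
}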
diff --git a/mm/memory.c b/mm/memory.c
index ec4e15494901..a728bed16c20 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
* vm_normal_page() so that we do not have to special case all
* call site of vm_normal_page().
*/
- if (likely(pfn < highest_memmap_pfn)) {
+ if (likely(pfn <= highest_memmap_pfn)) {
struct page *page = pfn_to_page(pfn);
if (is_device_public_page(page)) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e882cb6da994..d4b5f29906b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -328,6 +328,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
if (err && (err != -EEXIST))
break;
err = 0;
+ cond_resched();
}
vmemmap_populate_print_last();
out:
@@ -337,7 +338,7 @@ EXPORT_SYMBOL_GPL(__add_pages);
#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
-static int find_smallest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -362,7 +363,7 @@ static int find_smallest_section_pfn(int nid, struct zone *zone,
}
/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
-static int find_biggest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -550,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
return ret;
scn_nr = __section_nr(ms);
- start_pfn = section_nr_to_pfn(scn_nr);
+ start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
__remove_zone(zone, start_pfn);
sparse_remove_one_section(zone, ms, map_offset);
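
The cast added to __remove_section() matters because section_nr_to_pfn() is a plain shift: evaluated in int, it truncates on machines with very high section numbers. A sketch of the difference; demo_section_start_pfn() is illustrative:

#include <linux/mmzone.h>

static unsigned long demo_section_start_pfn(int scn_nr)
{
    /*
     * Wrong: section_nr_to_pfn() expands to a left shift by
     * PFN_SECTION_SHIFT, which done in 32-bit int can overflow:
     *
     *     return section_nr_to_pfn(scn_nr);
     *
     * Right: promote to unsigned long before the shift.
     */
    return section_nr_to_pfn((unsigned long)scn_nr);
}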
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 99736e026712..dee0f75c3013 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -40,6 +40,7 @@
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -495,6 +496,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
+ * If the mm has notifiers then we would need to invalidate them around
+ * unmap_page_range and that is risky because notifiers can sleep and
+ * what they do is essentially nondeterministic. So let's have a short
+ * sleep to give the oom victim some more time.
+ * TODO: we really want to get rid of this ugly hack and make sure that
+ * notifiers cannot block for an unbounded amount of time and add
+ * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ */
+ if (mm_has_notifiers(mm)) {
+ up_read(&mm->mmap_sem);
+ schedule_timeout_idle(HZ);
+ goto unlock_oom;
+ }
+
+ /*
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
* work on the mm anymore. The check for MMF_OOM_SKIP must run
* under mmap_sem for reading because it serializes against the
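
The oom reaper hunk simply backs off when the mm has mmu notifiers registered, because the required invalidation calls may sleep for an arbitrary time. A reduced sketch of that bail-out; demo_try_reap() is not the real __oom_reap_task_mm():

#include <linux/jiffies.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>

static bool demo_try_reap(struct mm_struct *mm)
{
    if (mm_has_notifiers(mm)) {
        /* give the victim more time rather than risk a long stall */
        schedule_timeout_idle(HZ);
        return false;
    }

    /* ... safe to unmap without notifier invalidation round-trips ... */
    return true;
}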
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af88836a..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
{
pg_data_t *pgdat;
int nid, zid;
@@ -5367,6 +5367,7 @@ not_early:
__init_single_page(page, pfn, zone, nid);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
} else {
__init_single_pfn(pfn, zone, nid);
}
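
Both cond_resched() additions in this series (here and in __add_pages()) follow the same rule: very long pfn-initialization loops must yield periodically so they do not starve other tasks or trip the soft-lockup watchdog. A trivial sketch (the real code yields at much coarser intervals); demo_init_pfn_range() is illustrative:

#include <linux/sched.h>

static void demo_init_pfn_range(unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;

    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        /* ... initialize the struct page for pfn ... */
        cond_resched();    /* yield so the loop cannot monopolize the CPU */
    }
}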
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 6142484e88f7..7a58460bfd27 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -73,7 +73,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
last_alloc + 1 : 0;
as_len = 0;
- start = chunk->start_offset;
+ start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
/*
* If a bit is set in the allocation map, the bound_map identifies
diff --git a/mm/percpu.c b/mm/percpu.c
index 59d44d61f5f1..aa121cef76de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -353,6 +353,8 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
block->contig_hint_start);
return;
}
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
*bits = block->right_free;
*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
@@ -407,6 +409,8 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
*bit_off = pcpu_block_off_to_off(i, block->first_free);
return;
}
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
align);
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6bb4deb12e78..d908c8769b48 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -14,7 +14,7 @@
#include <linux/uaccess.h>
#include <asm/sections.h>
-const int rodata_test_data = 0xC3;
+static const int rodata_test_data = 0xC3;
void rodata_test(void)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 904a83be82de..80164599ca5d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -165,9 +165,9 @@ static int init_memcg_params(struct kmem_cache *s,
if (!memcg_nr_cache_ids)
return 0;
- arr = kzalloc(sizeof(struct memcg_cache_array) +
- memcg_nr_cache_ids * sizeof(void *),
- GFP_KERNEL);
+ arr = kvzalloc(sizeof(struct memcg_cache_array) +
+ memcg_nr_cache_ids * sizeof(void *),
+ GFP_KERNEL);
if (!arr)
return -ENOMEM;
@@ -178,15 +178,23 @@ static int init_memcg_params(struct kmem_cache *s,
static void destroy_memcg_params(struct kmem_cache *s)
{
if (is_root_cache(s))
- kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+ kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static void free_memcg_params(struct rcu_head *rcu)
+{
+ struct memcg_cache_array *old;
+
+ old = container_of(rcu, struct memcg_cache_array, rcu);
+ kvfree(old);
}
static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
struct memcg_cache_array *old, *new;
- new = kzalloc(sizeof(struct memcg_cache_array) +
- new_array_size * sizeof(void *), GFP_KERNEL);
+ new = kvzalloc(sizeof(struct memcg_cache_array) +
+ new_array_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -198,7 +206,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
rcu_assign_pointer(s->memcg_params.memcg_caches, new);
if (old)
- kfree_rcu(old, rcu);
+ call_rcu(&old->rcu, free_memcg_params);
return 0;
}
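
kfree_rcu() can only hand memory back to the slab allocator, so once the memcg_caches array may be vmalloc-backed, the hunk above has to switch to call_rcu() with a callback that ends in kvfree(). A sketch of the same pattern on a made-up struct demo_array:

#include <linux/mm.h>
#include <linux/rcupdate.h>

struct demo_array {
    struct rcu_head rcu;
    void *entries[];
};

static void demo_array_free_rcu(struct rcu_head *rcu)
{
    /* kvfree() copes with either a kmalloc'ed or vmalloc'ed array */
    kvfree(container_of(rcu, struct demo_array, rcu));
}

static void demo_array_retire(struct demo_array *old)
{
    if (old)
        call_rcu(&old->rcu, demo_array_free_rcu);
}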
diff --git a/mm/swap.c b/mm/swap.c
index 9295ae960d66..a77d68f2c1b6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -575,7 +575,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageUnevictable(page)) {
+ !PageSwapCache(page) && !PageUnevictable(page)) {
bool active = PageActive(page);
del_page_from_lru_list(page, lruvec,
@@ -665,7 +665,7 @@ void deactivate_file_page(struct page *page)
void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageUnevictable(page)) {
+ !PageSwapCache(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
get_page(page);
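
The two swap.c hunks tighten the same eligibility test: a page that is still in the swap cache owns a swap slot and must not be treated as lazily freeable. Written out as a helper (demo_can_lazyfree() is a made-up name):

#include <linux/page-flags.h>

static bool demo_can_lazyfree(struct page *page)
{
    return PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
           !PageSwapCache(page) && !PageUnevictable(page);
}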
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 71ce2d1ccbf7..ed91091d1e68 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -242,6 +242,17 @@ int add_to_swap(struct page *page)
* clear SWAP_HAS_CACHE flag.
*/
goto fail;
+ /*
+ * Normally the page will be dirtied in unmap because its pte should be
+ * dirty. A special case is a MADV_FREE page. The page's pte could have
+ * the dirty bit cleared but the page's SwapBacked bit still set, because
+ * clearing the dirty bit and the SwapBacked bit is not protected by a lock.
+ * For such a page, unmap will not set the dirty bit, so page reclaim will
+ * not write the page out. This can cause data corruption when the page
+ * is swapped in later. Always setting the dirty bit for the page solves
+ * the problem.
+ */
+ set_page_dirty(page);
return 1;
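
The add_to_swap() change encodes the rule the comment describes: once an anonymous page has a swap slot it must be dirty, or reclaim will skip the writeout even though a racing MADV_FREE left the pte clean. A reduced sketch; demo_mark_for_writeout() is illustrative:

#include <linux/mm.h>
#include <linux/page-flags.h>

static void demo_mark_for_writeout(struct page *page)
{
    /* an anon page sitting in the swap cache must be written by reclaim */
    if (PageSwapBacked(page) && PageSwapCache(page))
        set_page_dirty(page);
}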
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 486550df32be..b2ba2ba585f3 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -250,6 +250,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
+ clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
@@ -303,7 +304,6 @@ static void free_pages_work(struct work_struct *w)
list_del(&zhdr->buddy);
if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
continue;
- clear_bit(NEEDS_COMPACTING, &page->private);
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
free_z3fold_page(page);
@@ -624,10 +624,8 @@ lookup:
* stale pages list. cancel_work_sync() can sleep so we must make
* sure it won't be called in case we're in atomic context.
*/
- if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
- !unlikely(work_busy(&zhdr->work)))) {
+ if (zhdr && (can_sleep || !work_pending(&zhdr->work))) {
list_del(&zhdr->buddy);
- clear_bit(NEEDS_COMPACTING, &page->private);
spin_unlock(&pool->stale_lock);
if (can_sleep)
cancel_work_sync(&zhdr->work);
@@ -875,16 +873,18 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
+ spin_lock(&pool->lock);
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
+ spin_unlock(&pool->lock);
free_z3fold_page(page);
return 0;
}
} else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
atomic64_dec(&pool->pages_nr);
+ spin_unlock(&pool->lock);
return 0;
}
- spin_lock(&pool->lock);
/*
* Add to the beginning of LRU.