summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/gup.c7
-rw-r--r--mm/huge_memory.c9
-rw-r--r--mm/hugetlb.c9
-rw-r--r--mm/khugepaged.c15
-rw-r--r--mm/kmemleak.c12
-rw-r--r--mm/memblock.c28
-rw-r--r--mm/memcontrol.c6
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c2
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/page_alloc.c21
-rw-r--r--mm/page_owner.c6
-rw-r--r--mm/percpu-km.c8
-rw-r--r--mm/percpu-vm.c18
-rw-r--r--mm/percpu.c67
-rw-r--r--mm/shmem.c31
-rw-r--r--mm/slab.c1
-rw-r--r--mm/swap.c84
-rw-r--r--mm/vmalloc.c10
-rw-r--r--mm/vmscan.c90
-rw-r--r--mm/vmstat.c2
-rw-r--r--mm/zpool.c2
-rw-r--r--mm/zswap.c6
24 files changed, 224 insertions, 221 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 1b46e6e74881..6afae32571ca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -516,7 +516,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
return -EBUSY;
}
@@ -890,7 +890,10 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
break;
}
if (*locked) {
- /* VM_FAULT_RETRY didn't trigger */
+ /*
+ * VM_FAULT_RETRY didn't trigger or it was a
+ * FOLL_NOWAIT.
+ */
if (!pages_done)
pages_done = ret;
break;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87ab9b8f56b5..5a68730eebd6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+ true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1316,7 +1317,7 @@ alloc:
}
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
+ huge_gfp | __GFP_NORETRY, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
- lock_page(page);
+ if (!trylock_page(page))
+ goto next;
/* split_huge_page() removes page from list on success */
if (!split_huge_page(page))
split++;
unlock_page(page);
+next:
put_page(page);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c204e3d132b..976bbc5646fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
@@ -1583,7 +1584,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
page = NULL;
} else {
h->surplus_huge_pages++;
- h->nr_huge_pages_node[page_to_nid(page)]++;
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
}
out_unlock:
@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
struct resv_map *resv_map;
long gbl_reserve;
+ /* This should never happen */
+ if (from > to) {
+ VM_WARN(1, "%s called with a negative range\n", __func__);
+ return -EINVAL;
+ }
+
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7e2268dfc9a..e42568284e06 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out;
+ }
+
VM_BUG_ON_PAGE(!PageAnon(page), page);
/*
@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ /* Do not oom kill for khugepaged charges */
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+ &memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ /* Do not oom kill for khugepaged charges */
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+ &memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e83987c55a08..46c2290a08f1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1657,8 +1657,7 @@ static void start_scan_thread(void)
}
/*
- * Stop the automatic memory scanning thread. This function must be called
- * with the scan_mutex held.
+ * Stop the automatic memory scanning thread.
*/
static void stop_scan_thread(void)
{
@@ -1921,12 +1920,15 @@ static void kmemleak_do_cleanup(struct work_struct *work)
{
stop_scan_thread();
+ mutex_lock(&scan_mutex);
/*
- * Once the scan thread has stopped, it is safe to no longer track
- * object freeing. Ordering of the scan thread stopping and the memory
- * accesses below is guaranteed by the kthread_stop() function.
+ * Once it is made sure that kmemleak_scan has stopped, it is safe to no
+ * longer track object freeing. Ordering of the scan thread stopping and
+ * the memory accesses below is guaranteed by the kthread_stop()
+ * function.
*/
kmemleak_free_enabled = 0;
+ mutex_unlock(&scan_mutex);
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
diff --git a/mm/memblock.c b/mm/memblock.c
index 5a9ca2a1751b..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
*out_nid = r->nid;
}
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
- unsigned long max_pfn)
-{
- struct memblock_type *type = &memblock.memory;
- unsigned int right = type->cnt;
- unsigned int mid, left = 0;
- phys_addr_t addr = PFN_PHYS(pfn + 1);
-
- do {
- mid = (right + left) / 2;
-
- if (addr < type->regions[mid].base)
- right = mid;
- else if (addr >= (type->regions[mid].base +
- type->regions[mid].size))
- left = mid + 1;
- else {
- /* addr is within the region, so pfn + 1 is valid */
- return min(pfn + 1, max_pfn);
- }
- } while (left < right);
-
- if (right == type->cnt)
- return max_pfn;
- else
- return min(PHYS_PFN(type->regions[right].base), max_pfn);
-}
-
/**
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 670e99b68aa6..9ec024b862ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -714,9 +714,9 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a zone and a priority level in @reclaim to
+ * Reclaimers can specify a node and a priority level in @reclaim to
* divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same zone and priority.
+ * reclaimers operating on the same node and priority.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -2299,7 +2299,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
}
/**
- * memcg_kmem_charge: charge a kmem page
+ * memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4b80ccee4535..8291b75f42c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1139,8 +1139,6 @@ int memory_failure(unsigned long pfn, int flags)
return 0;
}
- arch_unmap_kpfn(pfn);
-
orig_head = hpage = compound_head(p);
num_poisoned_pages_inc();
diff --git a/mm/memory.c b/mm/memory.c
index dd8de96f5547..5fcfc24904d1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -80,7 +80,7 @@
#include "internal.h"
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d879f1d8a44a..32cba0332787 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
+ /* a's ->flags is the same as b's */
+ if (a->flags & MPOL_F_LOCAL)
+ return true;
return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
diff --git a/mm/mlock.c b/mm/mlock.c
index 79398200e423..74e5a6547c3d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -64,6 +64,12 @@ void clear_page_mlock(struct page *page)
mod_zone_page_state(page_zone(page), NR_MLOCK,
-hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGCLEARED);
+ /*
+ * The previous TestClearPageMlocked() corresponds to the smp_mb()
+ * in __pagevec_lru_add_fn().
+ *
+ * See __pagevec_lru_add_fn for more explanation.
+ */
if (!isolate_lru_page(page)) {
putback_lru_page(page);
} else {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 81e18ceef579..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,6 +46,7 @@
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
+#include <xen/xen.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
@@ -347,6 +348,9 @@ static inline bool update_defer_init(pg_data_t *pgdat,
/* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /* Xen PV domains need page structures early */
+ if (xen_pv_domain())
+ return true;
(*nr_initialised)++;
if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
@@ -1906,7 +1910,9 @@ static int move_freepages(struct zone *zone,
* Remove at a later date when no bug reports exist related to
* grouping pages by mobility
*/
- VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
+ VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
+ pfn_valid(page_to_pfn(end_page)) &&
+ page_zone(start_page) != page_zone(end_page));
#endif
if (num_movable)
@@ -3590,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
return false;
/* this guy won't enter reclaim */
- if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ if (current->flags & PF_MEMALLOC)
return false;
/* We're only interested __GFP_FS allocations for now */
@@ -5350,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (context != MEMMAP_EARLY)
goto not_early;
- if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Skip to the pfn preceding the next valid one (or
- * end_pfn), such that we hit a valid pfn (or end_pfn)
- * on our next iteration of the loop.
- */
- pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+ if (!early_pfn_valid(pfn))
continue;
- }
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9886c6073828..7172e0a80e13 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -123,13 +123,13 @@ void __reset_page_owner(struct page *page, unsigned int order)
static inline bool check_recursive_alloc(struct stack_trace *trace,
unsigned long ip)
{
- int i, count;
+ int i;
if (!trace->nr_entries)
return false;
- for (i = 0, count = 0; i < trace->nr_entries; i++) {
- if (trace->entries[i] == ip && ++count == 2)
+ for (i = 0; i < trace->nr_entries; i++) {
+ if (trace->entries[i] == ip)
return true;
}
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d2a76642c4ae..38de70ab1a0d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
#include <linux/log2.h>
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
return 0;
}
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
int i;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
- pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+ pages = alloc_pages(gfp, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9158e5a81391..d8078de912de 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
- pages = pcpu_mem_zalloc(pages_size);
+ pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
return pages;
}
@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk. Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, int page_start, int page_end)
+ struct page **pages, int page_start, int page_end,
+ gfp_t gfp)
{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
unsigned int cpu, tcpu;
int i;
+ gfp |= __GFP_HIGHMEM;
+
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* @chunk: chunk of interest
* @page_start: the start page
* @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
struct page **pages;
@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
if (!pages)
return -ENOMEM;
- if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+ if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
return -ENOMEM;
if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 50e7fdf84055..9297098519a6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,6 +80,7 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
+#include <linux/sched.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -447,26 +448,25 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
/**
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
+ * @gfp: allocation flags
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
*
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
+ return kzalloc(size, gfp);
else
- return vzalloc(size);
+ return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
}
/**
@@ -1154,12 +1154,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
- chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
if (!chunk)
return NULL;
@@ -1168,17 +1168,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
region_bits = pcpu_chunk_map_bits(chunk);
chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
- sizeof(chunk->alloc_map[0]));
+ sizeof(chunk->alloc_map[0]), gfp);
if (!chunk->alloc_map)
goto alloc_map_fail;
chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
- sizeof(chunk->bound_map[0]));
+ sizeof(chunk->bound_map[0]), gfp);
if (!chunk->bound_map)
goto bound_map_fail;
chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
- sizeof(chunk->md_blocks[0]));
+ sizeof(chunk->md_blocks[0]), gfp);
if (!chunk->md_blocks)
goto md_blocks_fail;
@@ -1277,9 +1277,11 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
* pcpu_addr_to_page - translate address to physical address
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1339,6 +1341,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
+ /* whitelisted flags that can be passed to the backing allocators */
+ gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
@@ -1369,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
- if (!is_atomic)
- mutex_lock(&pcpu_alloc_mutex);
+ if (!is_atomic) {
+ /*
+ * pcpu_balance_workfn() allocates memory under this mutex,
+ * and it may wait for memory reclaim. Allow current task
+ * to become OOM victim, in case of memory pressure.
+ */
+ if (gfp & __GFP_NOFAIL)
+ mutex_lock(&pcpu_alloc_mutex);
+ else if (mutex_lock_killable(&pcpu_alloc_mutex))
+ return NULL;
+ }
spin_lock_irqsave(&pcpu_lock, flags);
@@ -1421,7 +1434,7 @@ restart:
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1450,7 +1463,7 @@ area_found:
page_start, page_end) {
WARN_ON(chunk->immutable);
- ret = pcpu_populate_chunk(chunk, rs, re);
+ ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
@@ -1561,10 +1574,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
@@ -1600,6 +1620,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
}
pcpu_destroy_chunk(chunk);
+ cond_resched();
}
/*
@@ -1645,7 +1666,7 @@ retry_pop:
chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
- ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
@@ -1662,7 +1683,7 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1907688b75ee..b85919243399 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -493,36 +493,45 @@ next:
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
- if (nr_to_split && split >= nr_to_split) {
- iput(inode);
- continue;
- }
+ if (nr_to_split && split >= nr_to_split)
+ goto leave;
- page = find_lock_page(inode->i_mapping,
+ page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
if (!page)
goto drop;
+ /* No huge page at the end of the file: nothing to split */
if (!PageTransHuge(page)) {
- unlock_page(page);
put_page(page);
goto drop;
}
+ /*
+ * Leave the inode on the list if we failed to lock
+ * the page at this time.
+ *
+ * Waiting for the lock may lead to deadlock in the
+ * reclaim path.
+ */
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto leave;
+ }
+
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
- if (ret) {
- /* split failed: leave it on the list */
- iput(inode);
- continue;
- }
+ /* If split failed leave the inode on the list */
+ if (ret)
+ goto leave;
split++;
drop:
list_del_init(&info->shrinklist);
removed++;
+leave:
iput(inode);
}
diff --git a/mm/slab.c b/mm/slab.c
index 324446621b3e..9095c3945425 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1283,6 +1283,7 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
+ memcg_link_cache(kmem_cache);
slab_state = PARTIAL;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 567a7b96e41d..0f17330dd0e5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,30 +446,6 @@ void lru_cache_add(struct page *page)
}
/**
- * add_page_to_unevictable_list - add a page to the unevictable list
- * @page: the page to be added to the unevictable list
- *
- * Add page directly to its zone's unevictable list. To avoid races with
- * tasks that might be making the page evictable, through eg. munlock,
- * munmap or exit, while it's not on the lru, we want to add the page
- * while it's locked or otherwise "invisible" to other tasks. This is
- * difficult to do when using the pagevec cache, so bypass that.
- */
-void add_page_to_unevictable_list(struct page *page)
-{
- struct pglist_data *pgdat = page_pgdat(page);
- struct lruvec *lruvec;
-
- spin_lock_irq(&pgdat->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- ClearPageActive(page);
- SetPageUnevictable(page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
- spin_unlock_irq(&pgdat->lru_lock);
-}
-
-/**
* lru_cache_add_active_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
@@ -484,13 +460,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
{
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
SetPageActive(page);
- lru_cache_add(page);
- return;
- }
-
- if (!TestSetPageMlocked(page)) {
+ else if (!TestSetPageMlocked(page)) {
/*
* We use the irq-unsafe __mod_zone_page_stat because this
* counter is not modified from interrupt context, and the pte
@@ -500,7 +472,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGMLOCKED);
}
- add_page_to_unevictable_list(page);
+ lru_cache_add(page);
}
/*
@@ -886,15 +858,55 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- int file = page_is_file_cache(page);
- int active = PageActive(page);
- enum lru_list lru = page_lru(page);
+ enum lru_list lru;
+ int was_unevictable = TestClearPageUnevictable(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
+ /*
+ * Page becomes evictable in two ways:
+ * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+ * 2) Before acquiring LRU lock to put the page to correct LRU and then
+ * a) do PageLRU check with lock [check_move_unevictable_pages]
+ * b) do PageLRU check before lock [clear_page_mlock]
+ *
+ * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
+ * following strict ordering:
+ *
+ * #0: __pagevec_lru_add_fn #1: clear_page_mlock
+ *
+ * SetPageLRU() TestClearPageMlocked()
+ * smp_mb() // explicit ordering // above provides strict
+ * // ordering
+ * PageMlocked() PageLRU()
+ *
+ *
+ * if '#1' does not observe setting of PG_lru by '#0' and fails
+ * isolation, the explicit barrier will make sure that page_evictable
+ * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
+ * can be reordered after PageMlocked check and can make '#1' to fail
+ * the isolation of the page whose Mlocked bit is cleared (#0 is also
+ * looking at the same page) and the evictable page will be stranded
+ * in an unevictable LRU.
+ */
+ smp_mb();
+
+ if (page_evictable(page)) {
+ lru = page_lru(page);
+ update_page_reclaim_stat(lruvec, page_is_file_cache(page),
+ PageActive(page));
+ if (was_unevictable)
+ count_vm_event(UNEVICTABLE_PGRESCUED);
+ } else {
+ lru = LRU_UNEVICTABLE;
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ if (!was_unevictable)
+ count_vm_event(UNEVICTABLE_PGCULLED);
+ }
+
add_page_to_lru_list(page, lruvec, lru);
- update_page_reclaim_stat(lruvec, file, active);
trace_mm_lru_insertion(page, lru);
}
@@ -913,7 +925,7 @@ EXPORT_SYMBOL(__pagevec_lru_add);
* @pvec: Where the resulting entries are placed
* @mapping: The address_space to search
* @start: The starting entry index
- * @nr_pages: The maximum number of pages
+ * @nr_entries: The maximum number of pages
* @indices: The cache indices corresponding to the entries in @pvec
*
* pagevec_lookup_entries() will search for and return a group of up
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 673942094328..ebff729cc956 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1943,11 +1943,15 @@ void *vmalloc_exec(unsigned long size)
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
-#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
-#define GFP_VMALLOC32 GFP_KERNEL
+/*
+ * 64b systems should always have either DMA or DMA32 zones. For others
+ * GFP_DMA32 should do the right thing and use the normal zone.
+ */
+#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
#endif
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 444749669187..cd5dc3faaa57 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -769,64 +769,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
*/
void putback_lru_page(struct page *page)
{
- bool is_unevictable;
- int was_unevictable = PageUnevictable(page);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
-redo:
- ClearPageUnevictable(page);
-
- if (page_evictable(page)) {
- /*
- * For evictable pages, we can use the cache.
- * In event of a race, worst case is we end up with an
- * unevictable page on [in]active list.
- * We know how to handle that.
- */
- is_unevictable = false;
- lru_cache_add(page);
- } else {
- /*
- * Put unevictable pages directly on zone's unevictable
- * list.
- */
- is_unevictable = true;
- add_page_to_unevictable_list(page);
- /*
- * When racing with an mlock or AS_UNEVICTABLE clearing
- * (page is unlocked) make sure that if the other thread
- * does not observe our setting of PG_lru and fails
- * isolation/check_move_unevictable_pages,
- * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
- * the page back to the evictable list.
- *
- * The other side is TestClearPageMlocked() or shmem_lock().
- */
- smp_mb();
- }
-
- /*
- * page's status can change while we move it among lru. If an evictable
- * page is on unevictable list, it never be freed. To avoid that,
- * check after we added it to the list, again.
- */
- if (is_unevictable && page_evictable(page)) {
- if (!isolate_lru_page(page)) {
- put_page(page);
- goto redo;
- }
- /* This means someone else dropped this page from LRU
- * So, it will be freed or putback to LRU again. There is
- * nothing to do here.
- */
- }
-
- if (was_unevictable && !is_unevictable)
- count_vm_event(UNEVICTABLE_PGRESCUED);
- else if (!was_unevictable && is_unevictable)
- count_vm_event(UNEVICTABLE_PGCULLED);
-
+ lru_cache_add(page);
put_page(page); /* drop ref from isolate */
}
@@ -1837,6 +1780,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
+ * If dirty pages are scanned that are not queued for IO, it
+ * implies that flushers are not doing their job. This can
+ * happen when memory pressure pushes dirty pages to the end of
+ * the LRU before the dirty limits are breached and the dirty
+ * data has expired. It can also happen when the proportion of
+ * dirty pages grows not through writes but through memory
+ * pressure reclaiming all the clean cache. And in some cases,
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep.
+ */
+ if (stat.nr_unqueued_dirty == nr_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+ /*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling here.
*/
@@ -1848,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
set_bit(PGDAT_CONGESTED, &pgdat->flags);
- /*
- * If dirty pages are scanned that are not queued for IO, it
- * implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty pages to the end of
- * the LRU before the dirty limits are breached and the dirty
- * data has expired. It can also happen when the proportion of
- * dirty pages grows not through writes but through memory
- * pressure reclaiming all the clean cache. And in some cases,
- * the flushers simply cannot keep up with the allocation
- * rate. Nudge the flusher threads in case they are asleep, but
- * also allow kswapd to start writing pages during reclaim.
- */
- if (stat.nr_unqueued_dirty == nr_taken) {
- wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /* Allow kswapd to start writing pages during reclaim. */
+ if (stat.nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
- }
/*
* If kswapd scans pages marked marked for immediate
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2db6db6b1..33581be705f0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1839,9 +1839,11 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
+ preempt_disable();
queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
+ preempt_enable();
}
}
diff --git a/mm/zpool.c b/mm/zpool.c
index f8cb83e7699b..01a771e304fa 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -360,7 +360,7 @@ u64 zpool_get_total_size(struct zpool *zpool)
/**
* zpool_evictable() - Test if zpool is potentially evictable
- * @pool The zpool to test
+ * @zpool: The zpool to test
*
* Zpool is only potentially evictable when it's created with struct
* zpool_ops.evict and its driver implements struct zpool_driver.shrink.
diff --git a/mm/zswap.c b/mm/zswap.c
index c004aa4fd3f4..61a5c41972db 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1007,6 +1007,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ /* THP isn't supported */
+ if (PageTransHuge(page)) {
+ ret = -EINVAL;
+ goto reject;
+ }
+
if (!zswap_enabled || !tree) {
ret = -ENODEV;
goto reject;