Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              1
-rw-r--r--  mm/filemap.c           67
-rw-r--r--  mm/gup.c                2
-rw-r--r--  mm/huge_memory.c       32
-rw-r--r--  mm/hugetlb.c           65
-rw-r--r--  mm/internal.h          12
-rw-r--r--  mm/ksm.c                2
-rw-r--r--  mm/madvise.c           13
-rw-r--r--  mm/memcontrol.c        40
-rw-r--r--  mm/memory.c           135
-rw-r--r--  mm/memory_hotplug.c   114
-rw-r--r--  mm/mmap.c              46
-rw-r--r--  mm/mremap.c            13
-rw-r--r--  mm/nommu.c              4
-rw-r--r--  mm/oom_kill.c          24
-rw-r--r--  mm/page-writeback.c     4
-rw-r--r--  mm/page_alloc.c       438
-rw-r--r--  mm/page_ext.c           6
-rw-r--r--  mm/page_idle.c          2
-rw-r--r--  mm/page_io.c           21
-rw-r--r--  mm/page_owner.c        68
-rw-r--r--  mm/shmem.c            206
-rw-r--r--  mm/slub.c              52
-rw-r--r--  mm/sparse-vmemmap.c    11
-rw-r--r--  mm/sparse.c            10
-rw-r--r--  mm/swap.c              24
-rw-r--r--  mm/swap_state.c       314
-rw-r--r--  mm/swapfile.c         362
-rw-r--r--  mm/userfaultfd.c       48
-rw-r--r--  mm/util.c               2
-rw-r--r--  mm/vmalloc.c           20
-rw-r--r--  mm/vmscan.c           113
-rw-r--r--  mm/vmstat.c            15
-rw-r--r--  mm/z3fold.c           479
-rw-r--r--  mm/zsmalloc.c           8
35 files changed, 1868 insertions, 905 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
depends on ARCH_HAS_ZONE_DEVICE
+ select RADIX_TREE_MULTIORDER
help
Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e01cb6e5173..9d21afd692b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
return -EEXIST;
mapping->nrexceptional--;
- if (!dax_mapping(mapping)) {
- if (shadowp)
- *shadowp = p;
- } else {
- /* DAX can replace empty locked entry with a hole */
- WARN_ON_ONCE(p !=
- dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- /* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index, p,
- true);
- }
+ if (shadowp)
+ *shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
workingset_update_node, mapping);
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- bool ret;
+ struct page *page;
if (end_byte < start_byte)
return false;
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
if (mapping->nrpages == 0)
return false;
- pagevec_init(&pvec, 0);
- if (!pagevec_lookup(&pvec, mapping, index, 1))
+ if (!find_get_pages_range(mapping, &index, end, 1, &page))
return false;
- ret = (pvec.pages[0]->index <= end);
- pagevec_release(&pvec);
- return ret;
+ put_page(page);
+ return true;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -1564,23 +1552,29 @@ export:
}
/**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping. The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
*
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of the specified range has been
+ * reached.
*/
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1627,11 +1624,25 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *start = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
rcu_read_unlock();
+
return ret;
}
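
As a side note on the find_get_pages_range() semantics documented above, the
following standalone userspace sketch models the same contract: gather up to
@nr_pages entries in [*start, end], advance *start for the next batch, and
clamp it so a pgoff_t of -1 cannot overflow. The helper name and the sample
"page cache" contents are invented for illustration; this is not kernel code.

/*
 * Illustrative model of the find_get_pages_range() return/iteration
 * contract, working over a sorted array of cached page indices.
 */
#include <stdio.h>
#include <stddef.h>

typedef unsigned long pgoff_t;

static unsigned range_lookup(const pgoff_t *cache, size_t ncache,
			     pgoff_t *start, pgoff_t end,
			     unsigned nr, pgoff_t *out)
{
	unsigned ret = 0;
	size_t i;

	for (i = 0; i < ncache && ret < nr; i++) {
		if (cache[i] < *start)
			continue;
		if (cache[i] > end)
			break;
		out[ret++] = cache[i];
		if (ret == nr) {
			/* More pages may remain: resume after the last one */
			*start = out[ret - 1] + 1;
			return ret;
		}
	}
	/* Range exhausted: park *start just past @end, avoiding overflow */
	*start = (end == (pgoff_t)-1) ? (pgoff_t)-1 : end + 1;
	return ret;
}

int main(void)
{
	const pgoff_t cache[] = { 3, 4, 9, 10, 11, 40 };	/* sparse indices */
	pgoff_t start = 0, batch[2];
	unsigned n;

	while ((n = range_lookup(cache, 6, &start, 12, 2, batch)) != 0) {
		for (unsigned i = 0; i < n; i++)
			printf("index %lu\n", batch[i]);
		if (n < 2)	/* a short batch means the range is done */
			break;
	}
	return 0;
}

Running the model prints indices 3, 4, 9, 10 and 11, i.e. everything in
[0, 12], with *start advanced across the three calls exactly as the kernel
comment describes.
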
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3644ff918434..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
NULL,
};
-static struct attribute_group hugepage_attr_group = {
+static const struct attribute_group hugepage_attr_group = {
.attrs = hugepage_attr,
};
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
goto release;
}
- clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (page_trans_huge_mapcount(page, NULL) == 1) {
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ put_page(page);
+ }
+ if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
+ unlock_page(page);
goto out_unlock;
}
+ unlock_page(page);
get_page(page);
spin_unlock(vmf->ptl);
alloc:
@@ -1291,7 +1305,7 @@ alloc:
count_vm_event(THP_FAULT_ALLOC);
if (!page)
- clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
@@ -2467,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (PageWriteback(page))
+ return -EBUSY;
+
if (PageAnon(head)) {
/*
* The caller does not necessarily hold an mmap_sem that would
@@ -2544,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
- ret = 0;
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ ret = split_swap_cluster(entry);
+ } else
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..34625b257128 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
}
static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, gfp_t gfp_mask)
{
unsigned long end_pfn = start_pfn + nr_pages;
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- GFP_KERNEL);
+ gfp_mask);
}
static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, unsigned int order)
+static struct page *alloc_gigantic_page(int nid, struct hstate *h)
{
+ unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
unsigned long ret, pfn, flags;
- struct zone *z;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ gfp_t gfp_mask;
- z = NODE_DATA(nid)->node_zones;
- for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
- spin_lock_irqsave(&z->lock, flags);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ spin_lock_irqsave(&zone->lock, flags);
- pfn = ALIGN(z->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
* spinning on this lock, it may win the race
* and cause alloc_contig_range() to fail...
*/
- spin_unlock_irqrestore(&z->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
if (!ret)
return pfn_to_page(pfn);
- spin_lock_irqsave(&z->lock, flags);
+ spin_lock_irqsave(&zone->lock, flags);
}
pfn += nr_pages;
}
- spin_unlock_irqrestore(&z->lock, flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
struct page *page;
- page = alloc_gigantic_page(nid, huge_page_order(h));
+ page = alloc_gigantic_page(nid, h);
if (page) {
prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, nid);
@@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = {
NULL,
};
-static struct attribute_group hstate_attr_group = {
+static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
- struct attribute_group *hstate_attr_group)
+ const struct attribute_group *hstate_attr_group)
{
int retval;
int hi = hstate_index(h);
@@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = {
NULL,
};
-static struct attribute_group per_node_hstate_attr_group = {
+static const struct attribute_group per_node_hstate_attr_group = {
.attrs = per_node_hstate_attrs,
};
@@ -4600,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
return pte;
}
+/*
+ * huge_pte_offset() - Walk the page table to resolve the hugepage
+ * entry at address @addr
+ *
+ * Return: Pointer to page table or swap entry (PUD or PMD) for
+ * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * size @sz doesn't match the hugepage size at this level of the page
+ * table.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
@@ -4614,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
+
pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
- if (pud_huge(*pud))
+ /* hugepage or swap? */
+ if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+
pmd = pmd_offset(pud, addr);
- return (pte_t *) pmd;
+ if (sz != PMD_SIZE && pmd_none(*pmd))
+ return NULL;
+ /* hugepage or swap? */
+ if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ return NULL;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM 0x08
+#else
+#define ALLOC_OOM ALLOC_NO_WATERMARKS
+#endif
+
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
}
+void setup_zone_pageset(struct zone *zone);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index db20f8436bc3..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
NULL,
};
-static struct attribute_group ksm_attr_group = {
+static const struct attribute_group ksm_attr_group = {
.attrs = ksm_attrs,
.name = "ksm",
};
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d7d1e5ddba9..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
}
new_flags &= ~VM_DONTCOPY;
break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
+ case MADV_WIPEONFORK:
+ case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
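
The MADV_WIPEONFORK/MADV_KEEPONFORK advice added above can be exercised from
userspace roughly as follows. This is a minimal sketch assuming a kernel and
libc that expose MADV_WIPEONFORK; the advice only applies to private anonymous
memory, which is exactly what the new madvise_behavior() check enforces.

/*
 * Anonymous private memory marked MADV_WIPEONFORK reads back as zeroes in
 * the child after fork(), while the parent keeps its data. Error handling
 * is kept minimal for brevity.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "secret session key");

#ifdef MADV_WIPEONFORK
	/* MADV_KEEPONFORK would undo this advice again */
	if (madvise(p, len, MADV_WIPEONFORK))
		perror("madvise(MADV_WIPEONFORK)");
#endif

	if (fork() == 0) {
		/* Child: the page was wiped, so this prints an empty string */
		printf("child sees:  \"%s\"\n", p);
		_exit(0);
	}
	wait(NULL);
	/* Parent: original contents are untouched */
	printf("parent sees: \"%s\"\n", p);
	return 0;
}
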
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..ad15850ee157 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threshold and synchronization as vmstat[] should be
* implemented.
+ *
+ * The parameter @event can be of type enum memcg_event_item or vm_event_item.
*/
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- enum memcg_event_item event)
+ int event)
{
unsigned long val = 0;
int cpu;
@@ -1915,7 +1917,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ if (unlikely(tsk_is_oom_victim(current) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
goto force;
@@ -4319,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
+ memcg->low = 0;
+
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4635,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (!ret || !target)
put_page(page);
}
- /* There is a swap entry and a page doesn't exist or isn't charged */
- if (ent.val && !ret &&
+ /*
+ * There is a swap entry and a page doesn't exist or isn't charged.
+ * But we cannot move a tail-page in a THP.
+ */
+ if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
@@ -4647,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
* Caller should make sure that pmd_trans_huge(pmd) is true.
*/
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5423,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page->mem_cgroup)
+ if (compound_head(page)->mem_cgroup)
goto out;
if (do_swap_account) {
@@ -5906,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+ nr_entries = hpage_nr_pages(page);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+ nr_entries);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, 1);
+ mem_cgroup_swap_statistics(swap_memcg, nr_entries);
page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+ page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, 1);
- page_counter_uncharge(&memcg->memsw, 1);
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
}
/*
@@ -5948,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, false, -1);
+ mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+ -nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
unmap_single_vma(&tlb, vma, start, end, NULL);
+
+ /*
+ * zap_page_range does not specify whether mmap_sem should be
+ * held for read or write. That allows parallel zap_page_range
+ * operations to unmap a PTE and defer a flush meaning that
+ * this call observes pte_none and fails to flush the TLB.
+ * Rather than adding a complex API, ensure that no stale
+ * TLB entries exist when this call returns.
+ */
+ flush_tlb_range(vma, start, end);
+ }
+
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1676,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1688,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
goto out;
retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match.
+ */
+ if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ goto out_unlock;
+ entry = *pte;
+ goto out_mkwrite;
+ } else
+ goto out_unlock;
+ }
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1766,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1802,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, pgprot);
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, false);
+
}
EXPORT_SYMBOL(vm_insert_mixed);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2571,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
- int total_mapcount;
+ int total_map_swapcount;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
- if (reuse_swap_page(vmf->page, &total_mapcount)) {
- if (total_mapcount == 1) {
+ if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
@@ -2704,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page, *swapcache;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
+ struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
+ bool vma_readahead = swap_use_vma_readahead();
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (vma_readahead)
+ page = swap_readahead_detect(vmf, &swap_ra);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+ if (page)
+ put_page(page);
goto out;
+ }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
@@ -2729,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry);
+ if (!page)
+ page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+ vmf->address);
if (!page) {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -4356,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
}
}
void clear_huge_page(struct page *page,
- unsigned long addr, unsigned int pages_per_huge_page)
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
{
- int i;
+ int i, n, base, l;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
+ /* Clear sub-page to access last to keep its cache lines hot */
might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
+ n = (addr_hint - addr) / PAGE_SIZE;
+ if (2 * n <= pages_per_huge_page) {
+ /* If sub-page to access in first half of huge page */
+ base = 0;
+ l = n;
+ /* Clear sub-pages at the end of huge page */
+ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ } else {
+ /* If sub-page to access in second half of huge page */
+ base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+ l = pages_per_huge_page - n;
+ /* Clear sub-pages at the begin of huge page */
+ for (i = 0; i < base; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ }
+ /*
+ * Clear remaining sub-pages in left-right-left-right pattern
+ * towards the sub-page to access
+ */
+ for (i = 0; i < l; i++) {
+ int left_idx = base + i;
+ int right_idx = base + 2 * l - 1 - i;
+
+ cond_resched();
+ clear_user_highpage(page + left_idx,
+ addr + left_idx * PAGE_SIZE);
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ clear_user_highpage(page + right_idx,
+ addr + right_idx * PAGE_SIZE);
}
}
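
The sub-page clearing order implemented in clear_huge_page() above can be
reproduced with plain index arithmetic. The sketch below is a userspace model
(invented helper name, a small sub-page count for readability) that prints the
order in which sub-pages would be cleared for a given faulting sub-page; the
target is cleared last so its cache lines stay hot.

/*
 * Model of the "clear the far end first, then converge left-right on the
 * faulting sub-page" pattern. Pure index arithmetic, no kernel code.
 */
#include <stdio.h>

static void print_clear_order(int pages, int target)
{
	int i, n = target, base, l;

	if (2 * n <= pages) {
		/* target in the first half: clear the tail first */
		base = 0;
		l = n;
		for (i = pages - 1; i >= 2 * n; i--)
			printf("%d ", i);
	} else {
		/* target in the second half: clear the head first */
		base = pages - 2 * (pages - n);
		l = pages - n;
		for (i = 0; i < base; i++)
			printf("%d ", i);
	}
	/* converge on the target in a left-right pattern */
	for (i = 0; i < l; i++)
		printf("%d %d ", base + i, base + 2 * l - 1 - i);
	printf("\n");
}

int main(void)
{
	/* a real THP has 512 sub-pages; 16 keeps the output readable */
	print_clear_order(16, 5);
	print_clear_order(16, 12);
	return 0;
}

For (16, 5) this prints 15..10, then 0 9 1 8 2 7 3 6 4 5: every sub-page is
cleared exactly once and sub-page 5 comes last, mirroring the kernel loop.
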
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8dccc317aac2..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
-{
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
- struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
-
- /*
- * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
- * physically before ZONE_MOVABLE. All we need is they do not
- * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
- * though so let's stick with it for simplicity for now.
- * TODO make sure we do not overlap with ZONE_DEVICE
- */
- if (online_type == MMOP_ONLINE_KERNEL) {
- if (zone_is_empty(movable_zone))
- return true;
- return movable_zone->zone_start_pfn >= pfn + nr_pages;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- return zone_end_pfn(default_zone) <= pfn;
- }
-
- /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
- return online_type == MMOP_ONLINE_KEEP;
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
* If no kernel zone covers this pfn range it will automatically go
* to the ZONE_NORMAL.
*/
-struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
unsigned long nr_pages)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
return &pgdat->node_zones[ZONE_NORMAL];
}
-static inline bool movable_pfn_range(int nid, struct zone *default_zone,
- unsigned long start_pfn, unsigned long nr_pages)
+static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
{
- if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
- MMOP_ONLINE_KERNEL))
- return true;
+ struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
+ nr_pages);
+ struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+ bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
+ bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
- if (!movable_node_is_enabled())
- return false;
+ /*
+ * We inherit the existing zone in a simple case where zones do not
+ * overlap in the given range
+ */
+ if (in_kernel ^ in_movable)
+ return (in_kernel) ? kernel_zone : movable_zone;
- return !zone_intersects(default_zone, start_pfn, nr_pages);
+ /*
+ * If the range doesn't belong to any zone or two zones overlap in the
+ * given range then we use movable zone only if movable_node is
+ * enabled because we always online to a kernel zone by default.
+ */
+ return movable_node_enabled ? movable_zone : kernel_zone;
+}
+
+struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+ unsigned long nr_pages)
+{
+ if (online_type == MMOP_ONLINE_KERNEL)
+ return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_MOVABLE)
+ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+
+ return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
/*
@@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone,
static struct zone * __meminit move_pfn_range(int online_type, int nid,
unsigned long start_pfn, unsigned long nr_pages)
{
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
-
- if (online_type == MMOP_ONLINE_KEEP) {
- struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
- /*
- * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
- * movable zone if that is not possible (e.g. we are within
- * or past the existing movable zone). movable_node overrides
- * this default and defaults to movable zone
- */
- if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
- zone = movable_zone;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- zone = &pgdat->node_zones[ZONE_MOVABLE];
- }
+ struct zone *zone;
+ zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
move_pfn_range_to_zone(zone, start_pfn, nr_pages);
return zone;
}
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
struct memory_notify arg;
nid = pfn_to_nid(pfn);
- if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
- return -EINVAL;
-
/* associate pfn range with the zone */
zone = move_pfn_range(online_type, nid, pfn, nr_pages);
@@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
- mutex_lock(&zonelists_mutex);
if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
- build_all_zonelists(NULL, zone);
+ setup_zone_pageset(zone);
}
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (ret) {
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
- mutex_unlock(&zonelists_mutex);
goto failed_addition;
}
@@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL);
else
zone_pcp_update(zone);
}
- mutex_unlock(&zonelists_mutex);
-
init_per_zone_wmark_min();
if (onlined_pages) {
@@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* The node we allocated has no zone fallback lists. For avoiding
* to access not-initialized zonelist, build here.
*/
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(pgdat, NULL);
- mutex_unlock(&zonelists_mutex);
+ build_all_zonelists(pgdat);
/*
* zone->managed_pages is set to an approximate value in
@@ -1100,13 +1075,6 @@ int try_online_node(int nid)
node_set_online(nid);
ret = register_one_node(nid);
BUG_ON(ret);
-
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
-
out:
mem_hotplug_done();
return ret;
@@ -1722,9 +1690,7 @@ repeat:
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
+ build_all_zonelists(NULL);
} else
zone_pcp_update(zone);
@@ -1750,7 +1716,7 @@ failed_removal:
return ret;
}
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
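
The zone selection policy introduced by zone_for_pfn_range() above boils down
to a small decision table. The sketch below is an illustrative userspace
reduction with invented names and plain booleans, not the kernel function:
explicit online requests win, "keep" inherits the only intersecting zone, and
otherwise movable_node decides the default.

#include <stdio.h>
#include <stdbool.h>

enum online_type { ONLINE_KERNEL, ONLINE_MOVABLE, ONLINE_KEEP };

static const char *pick_zone(enum online_type type, bool in_kernel,
			     bool in_movable, bool movable_node)
{
	if (type == ONLINE_KERNEL)
		return "kernel zone";
	if (type == ONLINE_MOVABLE)
		return "ZONE_MOVABLE";
	/* ONLINE_KEEP: inherit when exactly one zone covers the range */
	if (in_kernel ^ in_movable)
		return in_kernel ? "kernel zone" : "ZONE_MOVABLE";
	/* no zone (or both) covers it: movable_node decides the default */
	return movable_node ? "ZONE_MOVABLE" : "kernel zone";
}

int main(void)
{
	printf("%s\n", pick_zone(ONLINE_KEEP, true,  false, false)); /* kernel  */
	printf("%s\n", pick_zone(ONLINE_KEEP, false, false, true));  /* movable */
	printf("%s\n", pick_zone(ONLINE_MOVABLE, true, false, false));/* movable */
	return 0;
}
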
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..4c5981651407 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
+#include <linux/oom.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (vma->vm_start >= end)
return 0;
- if (uf) {
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
-
- if (error)
- return error;
- }
-
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
}
vma = prev ? prev->vm_next : mm->mmap;
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain split, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different from the case where the first of the two
+ * __split_vma fails, but we don't undo the first
+ * split, even though we could. This failure is unlikely
+ * enough that it's not worth optimizing for.
+ */
+ int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ if (error)
+ return error;
+ }
+
/*
* unlock any mlock()ed ranges before detaching vmas
*/
@@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ if (unlikely(tsk_is_oom_victim(current))) {
+ /*
+ * Wait for oom_reap_task() to stop working on this
+ * mm. Because MMF_OOM_SKIP is already set before
+ * calling down_read(), oom_reap_task() will not run
+ * on this "mm" post up_write().
+ *
+ * tsk_is_oom_victim() cannot be set from under us
+ * either because current->mm is already set to NULL
+ * under task_lock before calling mmput and oom_mm is
+ * set not NULL by the OOM killer only if current->mm
+ * is found not NULL while holding the task_lock.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
@@ -3514,7 +3540,7 @@ static int init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -3535,7 +3561,7 @@ static int init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
@@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
break;
case MEM_OFFLINE:
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
if (sysctl_user_reserve_kbytes > free_kbytes) {
init_user_reserve();
diff --git a/mm/mremap.c b/mm/mremap.c
index 3f23715d3c69..7395564daa6c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (!vma || vma->vm_start > addr)
return ERR_PTR(-EFAULT);
+ /*
+ * !old_len is a special case where an attempt is made to 'duplicate'
+ * a mapping. This makes no sense for private mappings as it will
+ * instead create a fresh/new mapping unrelated to the original. This
+ * is contrary to the basic idea of mremap which creates new mappings
+ * based on the original. There are no known use cases for this
+ * behavior. As a result, fail such attempts.
+ */
+ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ return ERR_PTR(-EINVAL);
+ }
+
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
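
The mremap() case rejected above is the old_size == 0 "duplicate a mapping"
trick, which only makes sense for shared mappings. A minimal userspace
illustration, assuming Linux with MREMAP_MAYMOVE: the shared duplication keeps
working, while the private attempt now fails with EINVAL on kernels carrying
this change (older kernels would silently create a fresh, unrelated mapping).

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *shared = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	char *priv = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (shared == MAP_FAILED || priv == MAP_FAILED)
		return 1;

	strcpy(shared, "hello");

	/* old_size == 0: ask for a second mapping of the same shared pages */
	char *alias = mremap(shared, 0, len, MREMAP_MAYMOVE);
	if (alias != MAP_FAILED)
		printf("shared alias sees: %s\n", alias);	/* "hello" */

	/* the same trick on a private mapping is now rejected with EINVAL */
	if (mremap(priv, 0, len, MREMAP_MAYMOVE) == MAP_FAILED)
		perror("mremap(private, 0, ...)");
	return 0;
}
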
diff --git a/mm/nommu.c b/mm/nommu.c
index fc184f597d59..53d5175a5c14 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..99736e026712 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * increase mm_users only after we know we will reap something so
- * that the mmput_async is called only when we have reaped something
- * and delayed __mmput doesn't matter that much
+ * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
+ * work on the mm anymore. The check for MMF_OOM_SKIP must run
+ * under mmap_sem for reading because it serializes against the
+ * down_write();up_write() cycle in exit_mmap().
*/
- if (!mmget_not_zero(mm)) {
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
K(get_mm_counter(mm, MM_SHMEMPAGES)));
up_read(&mm->mmap_sem);
- /*
- * Drop our reference but make sure the mmput slow path is called from a
- * different context because we shouldn't risk we get stuck there and
- * put the oom_reaper out of the way.
- */
- mmput_async(mm);
trace_finish_task_reaping(tsk->pid);
unlock_oom:
mutex_unlock(&oom_lock);
@@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/*
* If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just set TIF_MEMDIE so it can die quickly
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
*/
task_lock(p);
if (task_will_free_mem(p)) {
@@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
count_memcg_event_mm(mm, OOM_KILL);
/*
- * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
- * the OOM victim from depleting the memory reserves from the user
- * space under its control.
+ * We should send SIGKILL before granting access to memory reserves
+ * in order to prevent the OOM victim from depleting the memory
+ * reserves from the user space under its control.
*/
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bf050ab025b7..0b9c5cbe8eba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES);
+ x = global_zone_page_state(NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
- * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
* (the number of pages we may dirty without exceeding the dirty limits).
*/
static unsigned long dirty_poll_interval(unsigned long dirty,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9327a940e373..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
- else
- min -= min / 4;
+ } else {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that the victim is definitely going to
+ * be in the exit path shortly and will free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
+ if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
{
- if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ if (!tsk_is_oom_victim(tsk))
return false;
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+ return false;
+
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
if (gfp_mask & __GFP_MEMALLOC)
- return true;
+ return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- return true;
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- return true;
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
- return false;
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
+ int reserve_flags;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
- if (gfp_pfmemalloc_allowed(gfp_mask))
- alloc_flags = ALLOC_NO_WATERMARKS;
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = reserve_flags;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) &&
- (alloc_flags == ALLOC_NO_WATERMARKS ||
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4509,7 +4547,7 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -4538,7 +4576,7 @@ void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_page_state(NR_FREE_PAGES);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -4673,11 +4711,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE),
- global_page_state(NR_FREE_PAGES),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
free_pcp,
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4839,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
- zoneref_set_zone(zone,
- &zonelist->_zonerefs[nr_zones++]);
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
@@ -4858,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
return nr_zones;
}
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
#ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
static int __parse_numa_zonelist_order(char *s)
{
- if (*s == 'd' || *s == 'D') {
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
- } else if (*s == 'n' || *s == 'N') {
- user_zonelist_order = ZONELIST_ORDER_NODE;
- } else if (*s == 'z' || *s == 'Z') {
- user_zonelist_order = ZONELIST_ORDER_ZONE;
- } else {
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
+ /*
+ * We used to support different zonelist modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still uses the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4911,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- int ret;
-
if (!s)
return 0;
- ret = __parse_numa_zonelist_order(s);
- if (ret == 0)
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
- return ret;
+ return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
+char numa_zonelist_order[] = "Node";
+
/*
* sysctl handler for numa_zonelist_order
*/
@@ -4931,42 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ char *str;
int ret;
- static DEFINE_MUTEX(zl_order_mutex);
- mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
- ret = proc_dostring(table, write, buffer, length, ppos);
- if (ret)
- goto out;
- if (write) {
- int oldval = user_zonelist_order;
+ if (!write)
+ return proc_dostring(table, write, buffer, length, ppos);
+ str = memdup_user_nul(buffer, 16);
+ if (IS_ERR(str))
+ return PTR_ERR(str);
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
- /*
- * bogus value. restore saved string
- */
- strncpy((char *)table->data, saved_string,
- NUMA_ZONELIST_ORDER_LEN);
- user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order) {
- mem_hotplug_begin();
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- mem_hotplug_done();
- }
- }
-out:
- mutex_unlock(&zl_order_mutex);
+ ret = __parse_numa_zonelist_order(str);
+ kfree(str);
return ret;
}
@@ -5040,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -5058,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -5073,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
- int pos, j, node;
- int zone_type; /* needs to be signed */
- struct zone *z;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- pos = 0;
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (managed_zone(z)) {
- zoneref_set_zone(z,
- &zonelist->_zonerefs[pos++]);
- check_highest_zone(zone_type);
- }
- }
- }
- zonelist->_zonerefs[pos].zone = NULL;
- zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
- current_zonelist_order = default_zonelist_order();
- else
- current_zonelist_order = user_zonelist_order;
-}
static void build_zonelists(pg_data_t *pgdat)
{
- int i, node, load;
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
- struct zonelist *zonelist;
- unsigned int order = current_zonelist_order;
-
- /* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zonerefs[0].zone = NULL;
- zonelist->_zonerefs[0].zone_idx = 0;
- }
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5154,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- i = 0;
-
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
@@ -5166,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat)
node_distance(local_node, prev_node))
node_load[node] = load;
+ node_order[nr_nodes++] = node;
prev_node = node;
load--;
- if (order == ZONELIST_ORDER_NODE)
- build_zonelists_in_node_order(pgdat, node);
- else
- node_order[i++] = node; /* remember order */
- }
-
- if (order == ZONELIST_ORDER_ZONE) {
- /* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, i);
}
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
@@ -5204,21 +5111,17 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void set_zonelist_order(void)
-{
- current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
@@ -5231,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
@@ -5263,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat)
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
-
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
{
int nid;
- int cpu;
+ int __maybe_unused cpu;
pg_data_t *self = data;
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- }
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
-
- build_zonelists(pgdat);
- }
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
- /*
- * Initialize the boot_pagesets that are going to be used
- * for bootstrapping processors. The real pagesets for
- * each zone will be allocated later when the per cpu
- * allocator is available.
- *
- * boot_pagesets are used also for bootstrapping offline
- * cpus if the system is already booted because the pagesets
- * are needed to initialize allocators on a specific cpu too.
- * F.e. the percpu allocator needs the page allocator which
- * needs the percpu allocator in order to allocate its pagesets
- * (a chicken-egg dilemma).
- */
- for_each_possible_cpu(cpu) {
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ build_zonelists(pgdat);
+ }
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
@@ -5317,45 +5204,53 @@ static int __build_all_zonelists(void *data)
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
- if (cpu_online(cpu))
+ for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
- return 0;
+ spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
+ int cpu;
+
__build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
- * Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
{
- set_zonelist_order();
-
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
- if (zone)
- setup_zone_pageset(zone);
-#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+ __build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5371,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
@@ -5627,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
pageset_set_high_and_batch(zone, pcp);
}
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7081,9 +6975,11 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
- mutex_lock(&zonelists_mutex);
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
__setup_per_zone_wmarks();
- mutex_unlock(&zonelists_mutex);
+ spin_unlock(&lock);
}
/*
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 88ccc044b09a..32f18911deda 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
return addr;
}
- if (node_state(nid, N_HIGH_MEMORY))
- addr = vzalloc_node(size, nid);
- else
- addr = vzalloc(size);
+ addr = vzalloc_node(size, nid);
return addr;
}
@@ -409,6 +406,7 @@ void __init page_ext_init(void)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
+ cond_resched();
}
}
hotplug_memory_notifier(page_ext_callback, 0);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1b0f48c62316..4bd03a8d809e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = {
NULL,
};
-static struct attribute_group page_idle_attr_group = {
+static const struct attribute_group page_idle_attr_group = {
.bin_attrs = page_idle_bin_attrs,
.name = "page_idle",
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 5f61b54ee1f3..20139b90125a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,16 +28,18 @@
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
+ int i, nr = hpage_nr_pages(page);
struct bio *bio;
- bio = bio_alloc(gfp_flags, 1);
+ bio = bio_alloc(gfp_flags, nr);
if (bio) {
bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+ for (i = 0; i < nr; i++)
+ bio_add_page(bio, page + i, PAGE_SIZE, 0);
+ VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
}
return bio;
}
@@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page)
return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (unlikely(PageTransHuge(page)))
+ count_vm_event(THP_SWPOUT);
+#endif
+ count_vm_events(PSWPOUT, hpage_nr_pages(page));
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
return 0;
}
@@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
goto out;
}
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
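
Worked numbers for the THP swap-out path above (editor's illustration, assuming x86-64 defaults of 4 KiB base pages and 2 MiB PMD-sized THP):

/*
 * hpage_nr_pages() returns 512 for such a page, so get_swap_bio() now
 * builds a single bio covering 512 * 4096 B = 2 MiB, and a successful
 * writeout bumps PSWPOUT by 512 and THP_SWPOUT by 1 via
 * count_swpout_vm_event().
 */
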
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0fd9dcf2c5dc..8e2d7137510c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
+static depot_stack_handle_t early_handle;
static void init_early_allocated_pages(void);
@@ -53,7 +54,7 @@ static bool need_page_owner(void)
return true;
}
-static noinline void register_dummy_stack(void)
+static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
unsigned long entries[4];
struct stack_trace dummy;
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void)
dummy.skip = 0;
save_stack_trace(&dummy);
- dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+ return depot_save_stack(&dummy, GFP_KERNEL);
}
-static noinline void register_failure_stack(void)
+static noinline void register_dummy_stack(void)
{
- unsigned long entries[4];
- struct stack_trace failure;
+ dummy_handle = create_dummy_stack();
+}
- failure.nr_entries = 0;
- failure.max_entries = ARRAY_SIZE(entries);
- failure.entries = &entries[0];
- failure.skip = 0;
+static noinline void register_failure_stack(void)
+{
+ failure_handle = create_dummy_stack();
+}
- save_stack_trace(&failure);
- failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+static noinline void register_early_stack(void)
+{
+ early_handle = create_dummy_stack();
}
static void init_page_owner(void)
@@ -88,6 +90,7 @@ static void init_page_owner(void)
register_dummy_stack();
register_failure_stack();
+ register_early_stack();
static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
return handle;
}
-noinline void __set_page_owner(struct page *page, unsigned int order,
- gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
- if (unlikely(!page_ext))
- return;
-
page_owner = get_page_owner(page_ext);
- page_owner->handle = save_stack(gfp_mask);
+ page_owner->handle = handle;
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ depot_stack_handle_t handle;
+
+ if (unlikely(!page_ext))
+ return;
+
+ handle = save_stack(gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+}
+
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/*
- * We are safe to check buddy flag and order, because
- * this is init stage and only single thread runs.
+ * To avoid having to grab zone->lock, be a little
+ * careful when reading buddy page order. The only
+ * danger is that we skip too much and potentially miss
+ * some early allocated pages, which is better than
+ * heavy lock contention.
*/
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long order = page_order_unsafe(page);
+
+ if (order > 0 && order < MAX_ORDER)
+ pfn += (1UL << order) - 1;
continue;
}
@@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (unlikely(!page_ext))
continue;
- /* Maybe overraping zone */
+ /* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
/* Found early allocated page */
- set_page_owner(page, 0, 0);
+ __set_page_owner_handle(page_ext, early_handle, 0, 0);
count++;
}
+ cond_resched();
}
pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
@@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat)
{
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
- unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
init_pages_in_zone(pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
}
}
diff --git a/mm/shmem.c b/mm/shmem.c
index fbcb3c96a186..ace53a582be5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
+static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (shmem_acct_block(info->flags, pages))
+ return false;
+
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - pages) > 0)
+ goto unacct;
+ percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
+
+ return true;
+
+unacct:
+ shmem_unacct_blocks(info->flags, pages);
+ return false;
+}
+
+static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (sbinfo->max_blocks)
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
+ shmem_unacct_blocks(info->flags, pages);
+}
+
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
@@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed;
inode->i_blocks -= freed * BLOCKS_PER_PAGE;
- shmem_unacct_blocks(info->flags, freed);
+ shmem_inode_unacct_blocks(inode, freed);
}
}
bool shmem_charge(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
unsigned long flags;
- if (shmem_acct_block(info->flags, pages))
+ if (!shmem_inode_acct_block(inode, pages))
return false;
+
spin_lock_irqsave(&info->lock, flags);
info->alloced += pages;
inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages)
spin_unlock_irqrestore(&info->lock, flags);
inode->i_mapping->nrpages += pages;
- if (!sbinfo->max_blocks)
- return true;
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - pages) > 0) {
- inode->i_mapping->nrpages -= pages;
- spin_lock_irqsave(&info->lock, flags);
- info->alloced -= pages;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
- shmem_unacct_blocks(info->flags, pages);
- return false;
- }
- percpu_counter_add(&sbinfo->used_blocks, pages);
return true;
}
void shmem_uncharge(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
unsigned long flags;
spin_lock_irqsave(&info->lock, flags);
@@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages)
shmem_recalc_inode(inode);
spin_unlock_irqrestore(&info->lock, flags);
- if (sbinfo->max_blocks)
- percpu_counter_sub(&sbinfo->used_blocks, pages);
- shmem_unacct_blocks(info->flags, pages);
+ shmem_inode_unacct_blocks(inode, pages);
}
/*
@@ -1452,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
}
static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
- struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
+ struct inode *inode,
pgoff_t index, bool huge)
{
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct page *page;
int nr;
int err = -ENOSPC;
@@ -1463,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
- if (shmem_acct_block(info->flags, nr))
+ if (!shmem_inode_acct_block(inode, nr))
goto failed;
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - nr) > 0)
- goto unacct;
- percpu_counter_add(&sbinfo->used_blocks, nr);
- }
if (huge)
page = shmem_alloc_hugepage(gfp, info, index);
@@ -1483,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
}
err = -ENOMEM;
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -nr);
-unacct:
- shmem_unacct_blocks(info->flags, nr);
+ shmem_inode_unacct_blocks(inode, nr);
failed:
return ERR_PTR(err);
}
@@ -1644,7 +1650,7 @@ repeat:
if (swap.val) {
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap);
+ page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -1751,10 +1757,9 @@ repeat:
}
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
- index, true);
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
index, false);
}
if (IS_ERR(page)) {
@@ -1876,10 +1881,7 @@ clear:
* Error recovery.
*/
unacct:
- if (sbinfo->max_blocks)
- percpu_counter_sub(&sbinfo->used_blocks,
- 1 << compound_order(page));
- shmem_unacct_blocks(info->flags, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
if (PageTransHuge(page)) {
unlock_page(page);
@@ -2206,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping)
return mapping->a_ops == &shmem_aops;
}
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
+static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -2227,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
int ret;
ret = -ENOMEM;
- if (shmem_acct_block(info->flags, 1))
+ if (!shmem_inode_acct_block(inode, 1))
goto out;
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks) >= 0)
- goto out_unacct_blocks;
- percpu_counter_inc(&sbinfo->used_blocks);
- }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
- goto out_dec_used_blocks;
-
- page_kaddr = kmap_atomic(page);
- ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
- PAGE_SIZE);
- kunmap_atomic(page_kaddr);
-
- /* fallback to copy_from_user outside mmap_sem */
- if (unlikely(ret)) {
- *pagep = page;
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -1);
- shmem_unacct_blocks(info->flags, 1);
- /* don't free the page */
- return -EFAULT;
+ goto out_unacct_blocks;
+
+ if (!zeropage) { /* mcopy_atomic */
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *)src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ *pagep = page;
+ shmem_inode_unacct_blocks(inode, 1);
+ /* don't free the page */
+ return -EFAULT;
+ }
+ } else { /* mfill_zeropage_atomic */
+ clear_highpage(page);
}
} else {
page = *pagep;
@@ -2314,14 +2313,33 @@ out_release_uncharge:
out_release:
unlock_page(page);
put_page(page);
-out_dec_used_blocks:
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -1);
out_unacct_blocks:
- shmem_unacct_blocks(info->flags, 1);
+ shmem_inode_unacct_blocks(inode, 1);
goto out;
}
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, false, pagep);
+}
+
+int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct page *page = NULL;
+
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, 0, true, &page);
+}
+
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -3635,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -3647,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
char *name;
long len;
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
+ if (flags & MFD_ALLOW_SEALING)
+ return -EINVAL;
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3679,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
goto err_name;
}
- file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
}
- info = SHMEM_I(file_inode(file));
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_RDWR | O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING)
+
+ if (flags & MFD_ALLOW_SEALING) {
+ /*
+ * flags check at beginning of function ensures
+ * this is not a hugetlbfs (MFD_HUGETLB) file.
+ */
+ info = SHMEM_I(file_inode(file));
info->seals &= ~F_SEAL_SEAL;
+ }
fd_install(fd, file);
kfree(name);
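
From user space the new flag combination is exercised as in the sketch below (editor's example, not part of the patch; it assumes uapi headers recent enough to define MFD_HUGETLB and the MFD_HUGE_2MB size encoding):

#define _GNU_SOURCE
#include <linux/memfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* hugetlbfs-backed memfd, 2 MiB pages; sealing must not be requested */
	int fd = syscall(SYS_memfd_create, "huge-pool",
			 MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);

	if (fd < 0)
		perror("memfd_create");	/* e.g. EINVAL if MFD_ALLOW_SEALING was added */
	else
		close(fd);
	return 0;
}
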
diff --git a/mm/slub.c b/mm/slub.c
index e8b4e31162ca..ddb04576b342 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,7 @@
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
+#include <linux/random.h>
#include <trace/events/kmem.h>
@@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
+/*
+ * Returns freelist pointer (ptr). With hardening, this is obfuscated
+ * with an XOR of the address where the pointer is held and a per-cache
+ * random number.
+ */
+static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
+ unsigned long ptr_addr)
+{
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+#else
+ return ptr;
+#endif
+}
+
+/* Returns the freelist pointer recorded at location ptr_addr. */
+static inline void *freelist_dereference(const struct kmem_cache *s,
+ void *ptr_addr)
+{
+ return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
+ (unsigned long)ptr_addr);
+}
+
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
- return *(void **)(object + s->offset);
+ return freelist_dereference(s, object + s->offset);
}
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- prefetch(object + s->offset);
+ if (object)
+ prefetch(freelist_dereference(s, object + s->offset));
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
+ unsigned long freepointer_addr;
void *p;
if (!debug_pagealloc_enabled())
return get_freepointer(s, object);
- probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
- return p;
+ freepointer_addr = (unsigned long)object + s->offset;
+ probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
+ return freelist_ptr(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
- *(void **)(object + s->offset) = fp;
+ unsigned long freeptr_addr = (unsigned long)object + s->offset;
+
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ BUG_ON(object == fp); /* naive detection of double free or corruption */
+#endif
+
+ *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
@@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n) {
- kmem_cache_free(kmem_cache_node, n);
s->node[node] = NULL;
+ kmem_cache_free(kmem_cache_node, n);
}
}
@@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 0;
}
- s->node[node] = n;
init_kmem_cache_node(n);
+ s->node[node] = n;
}
return 1;
}
@@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
{
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
s->reserved = 0;
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ s->random = get_random_long();
+#endif
if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
s->reserved = sizeof(struct rcu_head);
@@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = {
NULL
};
-static struct attribute_group slab_attr_group = {
+static const struct attribute_group slab_attr_group = {
.attrs = slab_attrs,
};
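
The obfuscation scheme is easiest to see outside the kernel. The stand-alone sketch below (editor's illustration; cache_random and slot stand in for s->random and object + s->offset) stores a freelist pointer XORed with the per-cache secret and the slot address, then recovers it the same way get_freepointer() does:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *freelist_ptr(uintptr_t cache_random, void *ptr, uintptr_t ptr_addr)
{
	/* XOR is its own inverse, so the same helper encodes and decodes */
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	uintptr_t cache_random = 0xf00dfeedUL;	/* per-cache secret */
	void *object = malloc(64);
	void *next = malloc(64);
	void **slot = (void **)object;		/* freelist slot inside object */

	*slot = freelist_ptr(cache_random, next, (uintptr_t)slot);
	printf("stored  %p\n", *slot);
	printf("decoded %p (next is %p)\n",
	       freelist_ptr(cache_random, *slot, (uintptr_t)slot), next);
	free(object);
	free(next);
	return 0;
}
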
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d1a39b8051e0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
if (slab_is_available()) {
struct page *page;
- if (node_state(node, N_HIGH_MEMORY))
- page = alloc_pages_node(
- node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
- get_order(size));
- else
- page = alloc_pages(
- GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
- get_order(size));
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+ get_order(size));
if (page)
return page_address(page);
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..a9783acf2bb9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available()) {
- if (node_state(nid, N_HIGH_MEMORY))
- section = kzalloc_node(array_size, GFP_KERNEL, nid);
- else
- section = kzalloc(array_size, GFP_KERNEL);
- } else {
+ if (slab_is_available())
+ section = kzalloc_node(array_size, GFP_KERNEL, nid);
+ else
section = memblock_virt_alloc_node(array_size, nid);
- }
return section;
}
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..62d96b8e5eb3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -946,28 +946,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
}
/**
- * pagevec_lookup - gang pagecache lookup
+ * pagevec_lookup_range - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index
* @nr_pages: The maximum number of pages
*
- * pagevec_lookup() will search for and return a group of up to @nr_pages pages
- * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
+ * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting from index @start and up to index @end
+ * (inclusive). The pages are placed in @pvec. pagevec_lookup_range() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages.
+ * indexes. There may be holes in the indices due to not-present pages. We
+ * also update @start to index the next page for the traversal.
*
- * pagevec_lookup() returns the number of pages which were found.
+ * pagevec_lookup_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of the specified range has been
+ * reached.
*/
-unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
- pgoff_t start, unsigned nr_pages)
+unsigned pagevec_lookup_range(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
- pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+ pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
+ pvec->pages);
return pagevec_count(pvec);
}
-EXPORT_SYMBOL(pagevec_lookup);
+EXPORT_SYMBOL(pagevec_lookup_range);
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
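
A typical caller under the new contract looks like the sketch below (editor's illustration; the walker itself is made up): the lookup advances @start on its own, and a zero return means the whole range has been visited.

static void sketch_walk_range(struct address_space *mapping,
			      pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			/* pvec.pages[i] is a referenced page, index <= end */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
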
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b68c93014f50..71ce2d1ccbf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
+bool swap_vma_readahead = true;
+
+#define SWAP_RA_MAX_ORDER_DEFAULT 3
+
+static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
+
+#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits) \
+ (((addr) & PAGE_MASK) | \
+ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
+ ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma) \
+ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
@@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
* lock getting page table operations atomic even if we drop the page
* lock before returning.
*/
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct page *page;
+ unsigned long ra_info;
+ int win, hits, readahead;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
- if (page && likely(!PageTransCompound(page))) {
+ INC_CACHE_INFO(find_total);
+ if (page) {
INC_CACHE_INFO(find_success);
- if (TestClearPageReadahead(page))
- atomic_inc(&swapin_readahead_hits);
+ if (unlikely(PageTransCompound(page)))
+ return page;
+ readahead = TestClearPageReadahead(page);
+ if (vma) {
+ ra_info = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_info);
+ hits = SWAP_RA_HITS(ra_info);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
+ }
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma)
+ atomic_inc(&swapin_readahead_hits);
+ }
}
-
- INC_CACHE_INFO(find_total);
return page;
}
@@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return retpage;
}
-static unsigned long swapin_nr_pages(unsigned long offset)
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+ unsigned long offset,
+ int hits,
+ int max_pages,
+ int prev_win)
{
- static unsigned long prev_offset;
- unsigned int pages, max_pages, last_ra;
- static atomic_t last_readahead_pages;
-
- max_pages = 1 << READ_ONCE(page_cluster);
- if (max_pages <= 1)
- return 1;
+ unsigned int pages, last_ra;
/*
* This heuristic has been found to work well on both sequential and
* random loads, swapping to hard disk or to SSD: please don't ask
* what the "+ 2" means, it just happens to work well, that's all.
*/
- pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ pages = hits + 2;
if (pages == 2) {
/*
* We can have no readahead hits to judge by: but must not get
@@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
*/
if (offset != prev_offset + 1 && offset != prev_offset - 1)
pages = 1;
- prev_offset = offset;
} else {
unsigned int roundup = 4;
while (roundup < pages)
@@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
pages = max_pages;
/* Don't shrink readahead too fast */
- last_ra = atomic_read(&last_readahead_pages) / 2;
+ last_ra = prev_win / 2;
if (pages < last_ra)
pages = last_ra;
+
+ return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int hits, pages, max_pages;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << READ_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ hits = atomic_xchg(&swapin_readahead_hits, 0);
+ pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ atomic_read(&last_readahead_pages));
+ if (!hits)
+ prev_offset = offset;
atomic_set(&last_readahead_pages, pages);
return pages;
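
A worked pass through the heuristic above (editor's illustration, with the default page_cluster of 3, i.e. max_pages = 8):

/*
 * hits = 5, prev_win = 8:
 *	pages = 5 + 2 = 7, rounded up to the next power of two (from 4): 8,
 *	capped at max_pages (8) and not allowed below prev_win / 2 (4)
 *	-> readahead window of 8 pages
 *
 * hits = 0, fault offset not adjacent to prev_offset, prev_win = 2:
 *	pages = 0 + 2 = 2, demoted to 1 because the access looks random,
 *	and prev_win / 2 = 1 does not pull it back up
 *	-> window of 1, i.e. no readahead
 */
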
@@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long start_offset, end_offset;
unsigned long mask;
struct blk_plug plug;
- bool do_poll = true;
+ bool do_poll = true, page_allocated;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
- page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr, false);
+ page = __read_swap_cache_async(
+ swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr, &page_allocated);
if (!page)
continue;
- if (offset != entry_offset && likely(!PageTransCompound(page)))
- SetPageReadahead(page);
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (offset != entry_offset &&
+ likely(!PageTransCompound(page))) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
put_page(page);
}
blk_finish_plug(&plug);
@@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type)
synchronize_rcu();
kvfree(spaces);
}
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+ unsigned long faddr,
+ unsigned long lpfn,
+ unsigned long rpfn,
+ unsigned long *start,
+ unsigned long *end)
+{
+ *start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ *end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+struct page *swap_readahead_detect(struct vm_fault *vmf,
+ struct vma_swap_readahead *swap_ra)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long swap_ra_info;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long faddr, pfn, fpfn;
+ unsigned long start, end;
+ pte_t *pte;
+ unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+ pte_t *tpte;
+#endif
+
+ faddr = vmf->address;
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ if ((unlikely(non_swap_entry(entry))))
+ return NULL;
+ page = lookup_swap_cache(entry, vma, faddr);
+ if (page)
+ return page;
+
+ max_win = 1 << READ_ONCE(swap_ra_max_order);
+ if (max_win == 1) {
+ swap_ra->win = 1;
+ return NULL;
+ }
+
+ fpfn = PFN_DOWN(faddr);
+ swap_ra_info = GET_SWAP_RA_VAL(vma);
+ pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
+ prev_win = SWAP_RA_WIN(swap_ra_info);
+ hits = SWAP_RA_HITS(swap_ra_info);
+ swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+ max_win, prev_win);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(faddr, win, 0));
+
+ if (win == 1)
+ return NULL;
+
+ /* Copy the PTEs because the page table may be unmapped */
+ if (fpfn == pfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+ else if (pfn == fpfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+ &start, &end);
+ else {
+ left = (win - 1) / 2;
+ swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+ &start, &end);
+ }
+ swap_ra->nr_pte = end - start;
+ swap_ra->offset = fpfn - start;
+ pte = vmf->pte - swap_ra->offset;
+#ifdef CONFIG_64BIT
+ swap_ra->ptes = pte;
+#else
+ tpte = swap_ra->ptes;
+ for (pfn = start; pfn != end; pfn++)
+ *tpte++ = *pte++;
+#endif
+
+ return NULL;
+}
+
+struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+ struct vm_fault *vmf,
+ struct vma_swap_readahead *swap_ra)
+{
+ struct blk_plug plug;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ pte_t *pte, pentry;
+ swp_entry_t entry;
+ unsigned int i;
+ bool page_allocated;
+
+ if (swap_ra->win == 1)
+ goto skip;
+
+ blk_start_plug(&plug);
+ for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+ i++, pte++) {
+ pentry = *pte;
+ if (pte_none(pentry))
+ continue;
+ if (pte_present(pentry))
+ continue;
+ entry = pte_to_swp_entry(pentry);
+ if (unlikely(non_swap_entry(entry)))
+ continue;
+ page = __read_swap_cache_async(entry, gfp_mask, vma,
+ vmf->address, &page_allocated);
+ if (!page)
+ continue;
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (i != swap_ra->offset &&
+ likely(!PageTransCompound(page))) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
+ put_page(page);
+ }
+ blk_finish_plug(&plug);
+ lru_add_drain();
+skip:
+ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+ swap_ra->win == 1);
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t vma_ra_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
+}
+static ssize_t vma_ra_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ swap_vma_readahead = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ swap_vma_readahead = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+static struct kobj_attribute vma_ra_enabled_attr =
+ __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
+ vma_ra_enabled_store);
+
+static ssize_t vma_ra_max_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", swap_ra_max_order);
+}
+static ssize_t vma_ra_max_order_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err, v;
+
+ err = kstrtoint(buf, 10, &v);
+ if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
+ return -EINVAL;
+
+ swap_ra_max_order = v;
+
+ return count;
+}
+static struct kobj_attribute vma_ra_max_order_attr =
+ __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
+ vma_ra_max_order_store);
+
+static struct attribute *swap_attrs[] = {
+ &vma_ra_enabled_attr.attr,
+ &vma_ra_max_order_attr.attr,
+ NULL,
+};
+
+static struct attribute_group swap_attr_group = {
+ .attrs = swap_attrs,
+};
+
+static int __init swap_init_sysfs(void)
+{
+ int err;
+ struct kobject *swap_kobj;
+
+ swap_kobj = kobject_create_and_add("swap", mm_kobj);
+ if (!swap_kobj) {
+ pr_err("failed to create swap kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(swap_kobj, &swap_attr_group);
+ if (err) {
+ pr_err("failed to register swap group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(swap_kobj);
+ return err;
+}
+subsys_initcall(swap_init_sysfs);
+#endif
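
The SWAP_RA_* encoding above packs the last faulting address, the readahead window and a hit counter into one atomic long. A stand-alone restatement (editor's sketch, assuming PAGE_SHIFT = 12, which leaves 6 bits for the window and 6 for the hits inside the sub-page bits):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define WIN_SHIFT	(PAGE_SHIFT / 2)		/* SWAP_RA_WIN_SHIFT */
#define HITS_MASK	((1UL << WIN_SHIFT) - 1)
#define WIN_MASK	(~PAGE_MASK & ~HITS_MASK)

int main(void)
{
	unsigned long addr = 0x12345000UL;	/* page-aligned fault address */
	unsigned long win = 8, hits = 3;
	unsigned long v = (addr & PAGE_MASK) |
			  ((win << WIN_SHIFT) & WIN_MASK) |
			  (hits & HITS_MASK);

	printf("packed 0x%lx\n", v);	/* 0x12345203 */
	printf("addr 0x%lx win %lu hits %lu\n",
	       v & PAGE_MASK, (v & WIN_MASK) >> WIN_SHIFT, v & HITS_MASK);
	return 0;
}
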
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+ info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
unsigned long offset)
{
@@ -580,6 +592,21 @@ new_cluster:
return found_free;
}
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ for_each_node(nid)
+ plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+ spin_lock(&swap_avail_lock);
+ __del_from_avail_list(p);
+ spin_unlock(&swap_avail_lock);
+}
+
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(si);
+ }
+}
+
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ spin_lock(&swap_avail_lock);
+ for_each_node(nid) {
+ WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
+ spin_unlock(&swap_avail_lock);
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
bool was_full = !si->highest_bit;
si->highest_bit = end;
- if (was_full && (si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&si->avail_list));
- if (plist_node_empty(&si->avail_list))
- plist_add(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
+ if (was_full && (si->flags & SWP_WRITEOK))
+ add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
offset = idx * SWAPFILE_CLUSTER;
ci = lock_cluster(si, offset);
alloc_cluster(si, idx);
- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
+ int node;
/* Only single cluster request supported */
WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ node = numa_node_id();
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
/* requeue si to after same-priority siblings */
- plist_requeue(&si->avail_list, &swap_avail_head);
+ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_list)) {
+ if (plist_node_empty(&si->avail_lists[node])) {
spin_unlock(&si->lock);
goto nextsi;
}
@@ -934,13 +968,14 @@ start_over:
WARN(!(si->flags & SWP_WRITEOK),
"swap_info %d in list but !SWP_WRITEOK\n",
si->type);
- plist_del(&si->avail_list, &swap_avail_head);
+ __del_from_avail_list(si);
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster)
- n_ret = swap_alloc_cluster(si, swp_entries);
- else
+ if (cluster) {
+ if (!(si->flags & SWP_FILE))
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ } else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
* swap_avail_head list then try it, otherwise start over
* if we have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_list))
+ if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
struct swap_cluster_info *ci;
struct swap_info_struct *si;
unsigned char *map;
- unsigned int i;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
- si = swap_info_get(entry);
+ si = _swap_info_get(entry);
if (!si)
return;
ci = lock_cluster(si, offset);
+ VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
- map[i] = 0;
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ if (!free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] &= ~SWAP_HAS_CACHE;
}
+ cluster_clear_huge(ci);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ } else if (free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+ if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+ free_swap_slot(entry);
+ }
+ }
+}
+
+int split_swap_cluster(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = _swap_info_get(entry);
+ if (!si)
+ return -EBUSY;
+ ci = lock_cluster(si, offset);
+ cluster_clear_huge(ci);
+ unlock_cluster(ci);
+ return 0;
}
#else
static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
return count;
}
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
+ unsigned long roffset = swp_offset(entry);
+ unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!ci || !cluster_is_huge(ci)) {
+ if (map[roffset] != SWAP_HAS_CACHE)
+ ret = true;
+ goto unlock_out;
+ }
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (map[offset + i] != SWAP_HAS_CACHE) {
+ ret = true;
+ break;
+ }
+ }
+unlock_out:
+ unlock_cluster_or_swap_info(si, ci);
+ return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+
+ if (likely(!PageTransCompound(page)))
+ return page_swapcount(page) != 0;
+
+ page = compound_head(page);
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si)
+ return swap_page_trans_huge_swapped(si, entry);
+ return false;
+}
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int i, map_swapcount, _total_mapcount, _total_swapcount;
+ unsigned long offset = 0;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci = NULL;
+ unsigned char *map = NULL;
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = _total_swapcount = map_swapcount = 0;
+ if (PageSwapCache(page)) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si) {
+ map = si->swap_map;
+ offset = swp_offset(entry);
+ }
+ }
+ if (map)
+ ci = lock_cluster(si, offset);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ _total_mapcount += mapcount;
+ if (map) {
+ swapcount = swap_count(map[offset + i]);
+ _total_swapcount += swapcount;
+ }
+ map_swapcount = max(map_swapcount, mapcount + swapcount);
+ }
+ unlock_cluster(ci);
+ if (PageDoubleMap(page)) {
+ map_swapcount -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ map_swapcount += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ if (total_swapcount)
+ *total_swapcount = _total_swapcount;
+
+ return map_swapcount;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
+#define page_swapped(page) (page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+}
+#endif
+
/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
*
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
*/
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
- int count;
+ int count, total_mapcount, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
- count = page_trans_huge_mapcount(page, total_mapcount);
- if (count <= 1 && PageSwapCache(page)) {
- count += page_swapcount(page);
- if (count != 1)
- goto out;
+ count = page_trans_huge_map_swapcount(page, &total_mapcount,
+ &total_swapcount);
+ if (total_map_swapcount)
+ *total_map_swapcount = total_mapcount + total_swapcount;
+ if (count == 1 && PageSwapCache(page) &&
+ (likely(!PageTransCompound(page)) ||
+ /* The remaining swap count will be freed soon */
+ total_swapcount == page_swapcount(page))) {
if (!PageWriteback(page)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
} else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
spin_unlock(&p->lock);
}
}
-out:
+
return count <= 1;
}
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
return 0;
if (PageWriteback(page))
return 0;
- if (page_swapcount(page))
+ if (page_swapped(page))
return 0;
/*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage())
return 0;
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
count = __swap_entry_free(p, entry, 1);
- if (count == SWAP_HAS_CACHE) {
+ if (count == SWAP_HAS_CACHE &&
+ !swap_page_trans_huge_swapped(p, entry)) {
page = find_get_page(swap_address_space(entry),
swp_offset(entry));
if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
(!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_swapcount(p, entry)) {
+ !swap_page_trans_huge_swapped(p, entry)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
.sync_mode = WB_SYNC_NONE,
};
- swap_writepage(page, &wbc);
+ swap_writepage(compound_head(page), &wbc);
lock_page(page);
wait_on_page_writeback(page);
}
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
* delete, since it may not have been written out to swap yet.
*/
if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val))
- delete_from_swap_cache(page);
+ likely(page_private(page) == entry.val) &&
+ !page_swapped(page))
+ delete_from_swap_cache(compound_head(page));
/*
* So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
+static int swap_node(struct swap_info_struct *p)
+{
+ struct block_device *bdev;
+
+ if (p->bdev)
+ bdev = p->bdev;
+ else
+ bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+ return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
+ int i;
+
if (prio >= 0)
p->prio = prio;
else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* low-to-high, while swap ordering is high-to-low
*/
p->list.prio = -p->prio;
- p->avail_list.prio = -p->prio;
+ for_each_node(i) {
+ if (p->prio >= 0)
+ p->avail_lists[i].prio = -p->prio;
+ else {
+ if (swap_node(p) == i)
+ p->avail_lists[i].prio = 1;
+ else
+ p->avail_lists[i].prio = -p->prio;
+ }
+ }
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* swap_info_struct.
*/
plist_add(&p->list, &swap_active_head);
- spin_lock(&swap_avail_lock);
- plist_add(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ add_to_avail_list(p);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- spin_lock(&swap_avail_lock);
- plist_del(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(p);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
+ int nid;
plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
si->list.prio--;
- si->avail_list.prio--;
+ for_each_node(nid) {
+ if (si->avail_lists[nid].prio != 1)
+ si->avail_lists[nid].prio--;
+ }
}
least_priority++;
}
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
+ if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ atomic_dec(&nr_rotate_swap);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
unsigned int type;
+ int i;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
plist_node_init(&p->list, 0);
- plist_node_init(&p->avail_list, 0);
+ for_each_node(i)
+ plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!swap_avail_heads)
+ return -ENOMEM;
+
p = alloc_swap_info();
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
- }
+ } else
+ atomic_inc(&nr_rotate_swap);
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
}
+
+static int __init swapfile_init(void)
+{
+ int nid;
+
+ swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+ GFP_KERNEL);
+ if (!swap_avail_heads) {
+ pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+ return -ENOMEM;
+ }
+
+ for_each_node(nid)
+ plist_head_init(&swap_avail_heads[nid]);
+
+ return 0;
+}
+subsys_initcall(swapfile_init);
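
The per-node machinery added above (swap_node(), the avail_lists loop in _enable_swap_info(), the prio != 1 check in swapoff and the swap_avail_heads allocation in swapfile_init()) implements one mapping: a device's swap priority is projected onto a per-node priority so that an auto-prioritised device is preferred on its own NUMA node. A minimal userspace sketch of that mapping, assuming four nodes; the names are illustrative, not kernel API:

#include <stdio.h>

#define NR_NODES	4

/*
 * plists sort by ascending ->prio, so the negated swap priority is stored.
 * A device with an auto-assigned (negative) swap priority advertises the
 * top value 1 on its own node and its ordinary negated priority elsewhere.
 */
static void fill_avail_prios(int swap_prio, int device_node, int out[NR_NODES])
{
	for (int nid = 0; nid < NR_NODES; nid++) {
		if (swap_prio >= 0 || nid != device_node)
			out[nid] = -swap_prio;
		else
			out[nid] = 1;	/* prefer the node-local device */
	}
}

int main(void)
{
	int prios[NR_NODES];

	/* second auto-prioritised device (prio -2), backed by node 1 */
	fill_avail_prios(-2, 1, prios);
	for (int nid = 0; nid < NR_NODES; nid++)
		printf("node %d -> %d\n", nid, prios[nid]);	/* 2 1 2 2 */
	return 0;
}

The swapoff hunk only decrements per-node priorities whose stored value is not 1, so the node-local boost survives while the remaining devices are re-prioritised.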
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8bcb501bce60..81192701964d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */
+static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **page,
+ bool zeropage)
+{
+ ssize_t err;
+
+ if (vma_is_anonymous(dst_vma)) {
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ } else {
+ if (!zeropage)
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr,
+ src_addr, page);
+ else
+ err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ }
+
+ return err;
+}
+
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
@@ -487,22 +517,8 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- if (vma_is_anonymous(dst_vma)) {
- if (!zeropage)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr,
- &page);
- else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
- } else {
- err = -EINVAL; /* if zeropage is true return -EINVAL */
- if (likely(!zeropage))
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, &page);
- }
-
+ err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ src_addr, &page, zeropage);
cond_resched();
if (unlikely(err == -EFAULT)) {
diff --git a/mm/util.c b/mm/util.c
index 9ecddf568fe3..34e57fae959d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
+ free = global_zone_page_state(NR_FREE_PAGES);
free += global_node_page_state(NR_FILE_PAGES);
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a47e3894c775..8a43db6284eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int);
static void free_work(struct work_struct *w)
{
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *llnode = llist_del_all(&p->list);
- while (llnode) {
- void *p = llnode;
- llnode = llist_next(llnode);
- __vunmap(p, 1);
- }
+ struct llist_node *t, *llnode;
+
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ __vunmap((void *)llnode, 1);
}
/*** Page table manipulation functions ***/
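
As an illustration of the conversion above (outside the kernel's llist API): the deferred list is detached in one step and then walked with a saved next pointer, so each node can be freed during the walk, which is what llist_for_each_safe() provides. A small userspace analogue, with names chosen for the sketch only:

#include <stdlib.h>

struct dnode {
	struct dnode *next;
};

/* Walk a detached singly linked list; 'n' keeps the next pointer alive. */
#define node_for_each_safe(pos, n, first) \
	for ((pos) = (first); (pos) && ((n) = (pos)->next, 1); (pos) = (n))

static void free_all(struct dnode **head)
{
	struct dnode *first = *head;	/* the kernel detaches atomically via llist_del_all() */
	struct dnode *pos, *n;

	*head = NULL;
	node_for_each_safe(pos, n, first)
		free(pos);		/* stands in for __vunmap(p, 1) */
}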
@@ -2482,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* matching slot. While scanning, if any of the areas overlaps with
* existing vmap_area, the base address is pulled down to fit the
* area. Scanning is repeated till all the areas fit and then all
- * necessary data structres are inserted and the result is returned.
+ * necessary data structures are inserted and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2510,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
if (start > offsets[last_area])
last_area = area;
- for (area2 = 0; area2 < nr_vms; area2++) {
+ for (area2 = area + 1; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
- if (area2 == area)
- continue;
-
- BUG_ON(start2 >= start && start2 < end);
- BUG_ON(end2 <= end && end2 > start);
+ BUG_ON(start2 < end && start < end2);
}
}
last_end = offsets[last_area] + sizes[last_area];
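
The single BUG_ON above is the standard half-open interval test: [s1, e1) and [s2, e2) intersect exactly when each range starts before the other one ends. A standalone check with illustrative values only:

#include <assert.h>
#include <stdbool.h>

static bool ranges_overlap(unsigned long s1, unsigned long e1,
			   unsigned long s2, unsigned long e2)
{
	return s2 < e1 && s1 < e2;
}

int main(void)
{
	assert(ranges_overlap(0, 10, 5, 15));	/* partial overlap */
	assert(ranges_overlap(0, 10, 2, 4));	/* containment */
	assert(!ranges_overlap(0, 10, 10, 20));	/* adjacent ranges do not overlap */
	return 0;
}

Because the relation is symmetric, starting the inner loop at area + 1 checks every pair exactly once, which is why the area2 == area skip is no longer needed.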
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f957afe900ec..13d711dd8776 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
+ shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, nr_to_scan);
- total_scan -= nr_to_scan;
- scanned += nr_to_scan;
+ count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
+ total_scan -= shrinkctl->nr_scanned;
+ scanned += shrinkctl->nr_scanned;
cond_resched();
}
@@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache radix tree and
* optional buffer heads at page->private.
*/
- return page_count(page) - page_has_private(page) == 2;
+ int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ HPAGE_PMD_NR : 1;
+ return page_count(page) - page_has_private(page) == 1 + radix_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
+ int refcount;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_ref_freeze(page, 2))
+ if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
+ refcount = 1 + HPAGE_PMD_NR;
+ else
+ refcount = 2;
+ if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_ref_unfreeze(page, 2);
+ page_ref_unfreeze(page, refcount);
goto cannot_free;
}
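
Both hunks above rely on the same pin arithmetic: one reference held by the caller that isolated the page plus the references held by the page cache radix tree, which becomes HPAGE_PMD_NR instead of 1 for a THP in the swap cache. A plain C sketch of that expectation, taking HPAGE_PMD_NR as 512 (x86-64 with 4 KB base pages) purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* assumed value; it is architecture dependent */

static int expected_refs(bool thp_in_swapcache)
{
	int radix_pins = thp_in_swapcache ? HPAGE_PMD_NR : 1;

	return 1 /* isolating caller */ + radix_pins;
}

int main(void)
{
	printf("base page freezes at %d refs\n", expected_refs(false));		/* 2 */
	printf("THP in swap cache freezes at %d refs\n", expected_refs(true));	/* 513 */
	return 0;
}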
@@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
*/
- if (PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (PageTransHuge(page)) {
- /* cannot split THP, skip it */
- if (!can_split_huge_page(page, NULL))
- goto activate_locked;
- /*
- * Split pages without a PMD map right
- * away. Chances are some or all of the
- * tail pages can be freed without IO.
- */
- if (!compound_mapcount(page) &&
- split_huge_page_to_list(page, page_list))
- goto activate_locked;
- }
- if (!add_to_swap(page)) {
- if (!PageTransHuge(page))
- goto activate_locked;
- /* Split THP and swap individual base pages */
- if (split_huge_page_to_list(page, page_list))
- goto activate_locked;
- if (!add_to_swap(page))
- goto activate_locked;
- }
-
- /* XXX: We don't support THP writes */
- if (PageTransHuge(page) &&
- split_huge_page_to_list(page, page_list)) {
- delete_from_swap_cache(page);
- goto activate_locked;
- }
+ if (PageAnon(page) && PageSwapBacked(page)) {
+ if (!PageSwapCache(page)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (PageTransHuge(page)) {
+ /* cannot split THP, skip it */
+ if (!can_split_huge_page(page, NULL))
+ goto activate_locked;
+ /*
+ * Split pages without a PMD map right
+ * away. Chances are some or all of the
+ * tail pages can be freed without IO.
+ */
+ if (!compound_mapcount(page) &&
+ split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(page)) {
+ if (!PageTransHuge(page))
+ goto activate_locked;
+ /* Fallback to swap normal pages */
+ if (split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_vm_event(THP_SWPOUT_FALLBACK);
+#endif
+ if (!add_to_swap(page))
+ goto activate_locked;
+ }
- may_enter_fs = 1;
+ may_enter_fs = 1;
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ }
} else if (unlikely(PageTransHuge(page))) {
/* Split file THP */
if (split_huge_page_to_list(page, page_list))
goto keep_locked;
}
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
-
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page)) {
- if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
+ enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+
+ if (unlikely(PageTransHuge(page)))
+ flags |= TTU_SPLIT_HUGE_PMD;
+ if (!try_to_unmap(page, flags)) {
nr_unmap_fail++;
goto activate_locked;
}
@@ -1312,7 +1321,11 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- list_add(&page->lru, &free_pages);
+ if (unlikely(PageTransHuge(page))) {
+ mem_cgroup_uncharge(page);
+ (*get_compound_page_dtor(page))(page);
+ } else
+ list_add(&page->lru, &free_pages);
continue;
activate_locked:
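
The add_to_swap() block above is a simple fallback ladder: try to allocate swap space for the whole THP, and if that fails, split it, account a THP_SWPOUT_FALLBACK event and retry with base pages. A userspace sketch of just that control flow; the two stub predicates are stand-ins, not the kernel's add_to_swap() or split_huge_page_to_list():

#include <stdbool.h>

static unsigned long thp_swpout_fallback;	/* mirrors the THP_SWPOUT_FALLBACK counter */

/* Stubs: pretend huge allocations fail and base-page allocations succeed. */
static bool get_swap_space(bool whole_thp) { return !whole_thp; }
static bool split_thp(void) { return true; }

/* Returns true if swap space was allocated, false if the page must be re-activated. */
static bool swap_out_anon(bool is_thp)
{
	if (get_swap_space(is_thp))
		return true;
	if (!is_thp)
		return false;
	if (!split_thp())
		return false;
	thp_swpout_fallback++;
	return get_swap_space(false);	/* retry after the split */
}

int main(void)
{
	return swap_out_anon(true) ? 0 : 1;	/* exercises the fallback path */
}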
@@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ if (stalled)
+ return 0;
+
+ /* wait a bit for the reclaimer. */
+ msleep(100);
+ stalled = true;
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441bbeef2..c7e4b8458023 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
{
unsigned long requested = 1UL << order;
+ if (WARN_ON_ONCE(order >= MAX_ORDER))
+ return 0;
+
if (!info->free_blocks_total)
return 0;
@@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = {
#endif
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
+ "thp_swpout",
+ "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
@@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = {
"vmacache_find_hits",
"vmacache_full_flushes",
#endif
+#ifdef CONFIG_SWAP
+ "swap_ra",
+ "swap_ra_hit",
+#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
@@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
-/* Print out the free pages at each order for each migratetype */
+/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
int mtype;
@@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- v[i] = global_page_state(i);
+ v[i] = global_zone_page_state(i);
v += NR_VM_ZONE_STAT_ITEMS;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
* which can equally be echo'ed to or cat'ted from (by root),
* can be used to update the stats just before reading them.
*
- * Oh, and since global_page_state() etc. are so careful to hide
+ * Oh, and since global_zone_page_state() etc. are so careful to hide
* transiently negative values, report an error here if any of
* the stats is negative, so we know to go looking for imbalance.
*/
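
On kernels built with CONFIG_VM_EVENT_COUNTERS (plus CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_SWAP for the respective entries), the names added to vmstat_text above show up in /proc/vmstat. A small reader that prints only the new counters, included as a convenience sketch rather than part of the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "thp_swpout", 10) ||	/* also matches thp_swpout_fallback */
		    !strncmp(line, "swap_ra", 7))	/* also matches swap_ra_hit */
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}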
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 54f63c4a809a..486550df32be 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -23,10 +23,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
+#include <linux/sched.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
@@ -48,11 +51,15 @@ enum buddy {
};
/*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
* z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the pool
+ * @buddy: links the z3fold page into the relevant list in the
+ * pool
* @page_lock: per-page lock
- * @refcount: reference cound for the z3fold page
+ * @refcount: reference count for the z3fold page
+ * @work: work_struct for page layout optimization
+ * @pool: pointer to the pool which this page belongs to
+ * @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
@@ -62,6 +69,9 @@ struct z3fold_header {
struct list_head buddy;
spinlock_t page_lock;
struct kref refcount;
+ struct work_struct work;
+ struct z3fold_pool *pool;
+ short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
unsigned short last_chunks;
@@ -92,28 +102,39 @@ struct z3fold_header {
/**
* struct z3fold_pool - stores metadata for each z3fold pool
- * @lock: protects all pool fields and first|last_chunk fields of any
- * z3fold page in the pool
- * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies;
- * the lists each z3fold page is added to depends on the size of
- * its free region.
+ * @name: pool name
+ * @lock: protects pool unbuddied/lru lists
+ * @stale_lock: protects pool stale page list
+ * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
+ * buddies; the list each z3fold page is added to depends on
+ * the size of its free region.
* @lru: list tracking the z3fold pages in LRU order by most recently
* added buddy.
+ * @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @compact_wq: workqueue for page layout background optimization
+ * @release_wq: workqueue for safe page release
+ * @work: work_struct for safe page release
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
*/
struct z3fold_pool {
+ const char *name;
spinlock_t lock;
- struct list_head unbuddied[NCHUNKS];
+ spinlock_t stale_lock;
+ struct list_head *unbuddied;
struct list_head lru;
+ struct list_head stale;
atomic64_t pages_nr;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
+ struct workqueue_struct *compact_wq;
+ struct workqueue_struct *release_wq;
+ struct work_struct work;
};
/*
@@ -122,9 +143,10 @@ struct z3fold_pool {
enum z3fold_page_flags {
PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
+ NEEDS_COMPACTING,
+ PAGE_STALE
};
-
/*****************
* Helpers
*****************/
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size)
#define for_each_unbuddied_list(_iter, _begin) \
for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+static void compact_page_work(struct work_struct *w);
+
/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page)
+static struct z3fold_header *init_z3fold_page(struct page *page,
+ struct z3fold_pool *pool)
{
struct z3fold_header *zhdr = page_address(page);
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ clear_bit(PAGE_STALE, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
zhdr->last_chunks = 0;
zhdr->first_num = 0;
zhdr->start_middle = 0;
+ zhdr->cpu = -1;
+ zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
+ INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
}
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page)
__free_page(page);
}
-static void release_z3fold_page(struct kref *ref)
-{
- struct z3fold_header *zhdr;
- struct page *page;
-
- zhdr = container_of(ref, struct z3fold_header, refcount);
- page = virt_to_page(zhdr);
-
- if (!list_empty(&zhdr->buddy))
- list_del(&zhdr->buddy);
- if (!list_empty(&page->lru))
- list_del(&page->lru);
- free_z3fold_page(page);
-}
-
/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle)
return (handle - zhdr->first_num) & BUDDY_MASK;
}
+static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct page *page = virt_to_page(zhdr);
+ struct z3fold_pool *pool = zhdr->pool;
+
+ WARN_ON(!list_empty(&zhdr->buddy));
+ set_bit(PAGE_STALE, &page->private);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ if (locked)
+ z3fold_page_unlock(zhdr);
+ spin_lock(&pool->stale_lock);
+ list_add(&zhdr->buddy, &pool->stale);
+ queue_work(pool->release_wq, &pool->work);
+ spin_unlock(&pool->stale_lock);
+}
+
+static void __attribute__((__unused__))
+ release_z3fold_page(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ __release_z3fold_page(zhdr, false);
+}
+
+static void release_z3fold_page_locked(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void release_z3fold_page_locked_list(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ spin_lock(&zhdr->pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&zhdr->pool->lock);
+
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void free_pages_work(struct work_struct *w)
+{
+ struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
+
+ spin_lock(&pool->stale_lock);
+ while (!list_empty(&pool->stale)) {
+ struct z3fold_header *zhdr = list_first_entry(&pool->stale,
+ struct z3fold_header, buddy);
+ struct page *page = virt_to_page(zhdr);
+
+ list_del(&zhdr->buddy);
+ if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
+ continue;
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ spin_unlock(&pool->stale_lock);
+ cancel_work_sync(&zhdr->work);
+ free_z3fold_page(page);
+ cond_resched();
+ spin_lock(&pool->stale_lock);
+ }
+ spin_unlock(&pool->stale_lock);
+}
+
/*
* Returns the number of free chunks in a z3fold page.
* NB: can't be used with HEADLESS pages.
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr)
return nfree;
}
-/*****************
- * API Functions
-*****************/
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @gfp: gfp flags when allocating the z3fold pool structure
- * @ops: user-defined operations for the z3fold pool
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
- const struct z3fold_ops *ops)
-{
- struct z3fold_pool *pool;
- int i;
-
- pool = kzalloc(sizeof(struct z3fold_pool), gfp);
- if (!pool)
- return NULL;
- spin_lock_init(&pool->lock);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->lru);
- atomic64_set(&pool->pages_nr, 0);
- pool->ops = ops;
- return pool;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool: the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
- kfree(pool);
-}
-
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
return 0;
}
+static void do_compact_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct z3fold_pool *pool = zhdr->pool;
+ struct page *page;
+ struct list_head *unbuddied;
+ int fchunks;
+
+ page = virt_to_page(zhdr);
+ if (locked)
+ WARN_ON(z3fold_page_trylock(zhdr));
+ else
+ z3fold_page_lock(zhdr);
+ if (test_bit(PAGE_STALE, &page->private) ||
+ !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ z3fold_compact_page(zhdr);
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ fchunks = num_free_chunks(zhdr);
+ if (fchunks < NCHUNKS &&
+ (!zhdr->first_chunks || !zhdr->middle_chunks ||
+ !zhdr->last_chunks)) {
+ /* the page's not completely free and it's unbuddied */
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[fchunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ }
+ put_cpu_ptr(pool->unbuddied);
+ z3fold_page_unlock(zhdr);
+}
+
+static void compact_page_work(struct work_struct *w)
+{
+ struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
+ work);
+
+ do_compact_page(zhdr, false);
+}
+
+
+/*
+ * API Functions
+ */
+
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @name: pool name
+ * @gfp: gfp flags when allocating the z3fold pool structure
+ * @ops: user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
+ const struct z3fold_ops *ops)
+{
+ struct z3fold_pool *pool = NULL;
+ int i, cpu;
+
+ pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+ if (!pool)
+ goto out;
+ spin_lock_init(&pool->lock);
+ spin_lock_init(&pool->stale_lock);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ for_each_possible_cpu(cpu) {
+ struct list_head *unbuddied =
+ per_cpu_ptr(pool->unbuddied, cpu);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&unbuddied[i]);
+ }
+ INIT_LIST_HEAD(&pool->lru);
+ INIT_LIST_HEAD(&pool->stale);
+ atomic64_set(&pool->pages_nr, 0);
+ pool->name = name;
+ pool->compact_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->compact_wq)
+ goto out;
+ pool->release_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->release_wq)
+ goto out_wq;
+ INIT_WORK(&pool->work, free_pages_work);
+ pool->ops = ops;
+ return pool;
+
+out_wq:
+ destroy_workqueue(pool->compact_wq);
+out:
+ kfree(pool);
+ return NULL;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool: the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+ destroy_workqueue(pool->release_wq);
+ destroy_workqueue(pool->compact_wq);
+ kfree(pool);
+}
+
/**
* z3fold_alloc() - allocates a region of a given size
* @pool: z3fold pool from which to allocate
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
{
int chunks = 0, i, freechunks;
struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
enum buddy bud;
- struct page *page;
+ bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
+ struct list_head *unbuddied;
chunks = size_to_chunks(size);
+lookup:
/* First, try to find an unbuddied z3fold page. */
- zhdr = NULL;
+ unbuddied = get_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
- spin_lock(&pool->lock);
- zhdr = list_first_entry_or_null(&pool->unbuddied[i],
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
struct z3fold_header, buddy);
- if (!zhdr || !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
+
+ if (!zhdr)
continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ put_cpu_ptr(pool->unbuddied);
+ goto lookup;
}
- kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
chunks >= zhdr->start_middle)
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
else if (zhdr->middle_chunks == 0)
bud = MIDDLE;
else {
- z3fold_page_unlock(zhdr);
- spin_lock(&pool->lock);
if (kref_put(&zhdr->refcount,
- release_z3fold_page))
+ release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
- spin_unlock(&pool->lock);
+ else
+ z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- continue;
+ goto lookup;
}
goto found;
}
bud = FIRST;
}
- /* Couldn't find unbuddied z3fold page, create new one */
- page = alloc_page(gfp);
+ spin_lock(&pool->stale_lock);
+ zhdr = list_first_entry_or_null(&pool->stale,
+ struct z3fold_header, buddy);
+ /*
+ * Before allocating a page, let's see if we can take one from the
+ * stale pages list. cancel_work_sync() can sleep so we must make
+ * sure it won't be called in case we're in atomic context.
+ */
+ if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
+ !unlikely(work_busy(&zhdr->work)))) {
+ list_del(&zhdr->buddy);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ spin_unlock(&pool->stale_lock);
+ if (can_sleep)
+ cancel_work_sync(&zhdr->work);
+ page = virt_to_page(zhdr);
+ } else {
+ spin_unlock(&pool->stale_lock);
+ page = alloc_page(gfp);
+ }
+
if (!page)
return -ENOMEM;
atomic64_inc(&pool->pages_nr);
- zhdr = init_z3fold_page(page);
+ zhdr = init_z3fold_page(page, pool);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
- spin_lock(&pool->lock);
goto headless;
}
z3fold_page_lock(zhdr);
@@ -451,15 +659,21 @@ found:
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
- spin_lock(&pool->lock);
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
/* Add to unbuddied list */
freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
}
headless:
+ spin_lock(&pool->lock);
/* Add/move z3fold page to beginning of LRU */
if (!list_empty(&page->lru))
list_del(&page->lru);
@@ -487,7 +701,6 @@ headless:
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
struct z3fold_header *zhdr;
- int freechunks;
struct page *page;
enum buddy bud;
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
free_z3fold_page(page);
atomic64_dec(&pool->pages_nr);
- } else {
- if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 ||
- zhdr->last_chunks != 0) {
- z3fold_compact_page(zhdr);
- /* Add to the unbuddied list */
- spin_lock(&pool->lock);
- if (!list_empty(&zhdr->buddy))
- list_del(&zhdr->buddy);
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- }
+ return;
+ }
+
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ return;
+ }
+ if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
spin_lock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page))
- atomic64_dec(&pool->pages_nr);
+ list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
+ zhdr->cpu = -1;
+ do_compact_page(zhdr, true);
+ return;
}
-
+ queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ z3fold_page_unlock(zhdr);
}
/**
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
*/
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
- int i, ret = 0, freechunks;
- struct z3fold_header *zhdr;
- struct page *page;
+ int i, ret = 0;
+ struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
+ struct list_head *pos;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
spin_unlock(&pool->lock);
return -EINVAL;
}
- page = list_last_entry(&pool->lru, struct page, lru);
+ list_for_each_prev(pos, &pool->lru) {
+ page = list_entry(pos, struct page, lru);
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ /* candidate found */
+ break;
+
+ zhdr = page_address(page);
+ if (!z3fold_page_trylock(zhdr))
+ continue; /* can't evict at this point */
+ kref_get(&zhdr->refcount);
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ }
+
list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
- zhdr = page_address(page);
if (!test_bit(PAGE_HEADLESS, &page->private)) {
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- kref_get(&zhdr->refcount);
- spin_unlock(&pool->lock);
- z3fold_page_lock(zhdr);
/*
* We need encode the handles before unlocking, since
* we can race with free that will set
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
+ /*
+ * it's safe to unlock here because we hold a
+ * reference to this page
+ */
z3fold_page_unlock(zhdr);
} else {
first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
- spin_unlock(&pool->lock);
}
/* Issue the eviction callback(s) */
@@ -652,31 +879,12 @@ next:
if (ret == 0) {
free_z3fold_page(page);
return 0;
- } else {
- spin_lock(&pool->lock);
- }
- } else {
- z3fold_page_lock(zhdr);
- if ((zhdr->first_chunks || zhdr->last_chunks ||
- zhdr->middle_chunks) &&
- !(zhdr->first_chunks && zhdr->last_chunks &&
- zhdr->middle_chunks)) {
- z3fold_compact_page(zhdr);
- /* add to unbuddied list */
- spin_lock(&pool->lock);
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy,
- &pool->unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- }
- z3fold_page_unlock(zhdr);
- spin_lock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page)) {
- spin_unlock(&pool->lock);
- atomic64_dec(&pool->pages_nr);
- return 0;
}
+ } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
+ atomic64_dec(&pool->pages_nr);
+ return 0;
}
+ spin_lock(&pool->lock);
/*
* Add to the beginning of LRU.
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp,
{
struct z3fold_pool *pool;
- pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+ pool = z3fold_create_pool(name, gfp,
+ zpool_ops ? &z3fold_zpool_ops : NULL);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
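
The lookup loop in z3fold_alloc() above peeks at the per-CPU unbuddied list without the pool lock and only then re-validates the entry under the lock, restarting the whole lookup if the head changed in the meantime. A stripped-down userspace sketch of that "optimistic peek, re-check under lock" pattern, with a single list and a pthread mutex standing in for the pool spinlock (names and structure are illustrative only):

#include <pthread.h>
#include <stddef.h>

struct entry {
	struct entry *next;
};

static struct entry *unbuddied_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *pop_entry(void)
{
	struct entry *e;

lookup:
	e = unbuddied_head;		/* optimistic, unlocked peek */
	if (!e)
		return NULL;

	pthread_mutex_lock(&pool_lock);
	if (e != unbuddied_head) {	/* someone else won the race: retry */
		pthread_mutex_unlock(&pool_lock);
		goto lookup;
	}
	unbuddied_head = e->next;
	pthread_mutex_unlock(&pool_lock);
	return e;
}

int main(void)
{
	return pop_entry() == NULL ? 0 : 1;	/* empty list in this sketch */
}

The kernel version additionally trylocks the page and takes a kref before dropping the pool lock; as the comment in the hunk explains, holding the page lock is what makes the later kref_get() safe.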
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 308acb9d814b..62457eb82330 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
spin_lock(&class->lock);
if (!get_zspage_inuse(zspage)) {
- ret = -EBUSY;
- goto unlock_class;
+ /*
+ * Set "offset" to the end of the page so that every loop
+ * skips unnecessary object scanning.
+ */
+ offset = PAGE_SIZE;
}
pos = offset;
@@ -2052,7 +2055,6 @@ unpin_objects:
}
}
kunmap_atomic(s_addr);
-unlock_class:
spin_unlock(&class->lock);
migrate_write_unlock(zspage);