summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/compaction.c73
-rw-r--r--mm/debug.c4
-rw-r--r--mm/filemap.c89
-rw-r--r--mm/gup.c39
-rw-r--r--mm/huge_memory.c231
-rw-r--r--mm/hugetlb.c25
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kasan/kasan.c9
-rw-r--r--mm/kasan/quarantine.c94
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/khugepaged.c62
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/madvise.c1
-rw-r--r--mm/memcontrol.c39
-rw-r--r--mm/memory.c932
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/mempolicy.c30
-rw-r--r--mm/migrate.c19
-rw-r--r--mm/mprotect.c19
-rw-r--r--mm/nommu.c13
-rw-r--r--mm/page-writeback.c29
-rw-r--r--mm/page_alloc.c142
-rw-r--r--mm/page_io.c5
-rw-r--r--mm/percpu.c19
-rw-r--r--mm/process_vm_access.c12
-rw-r--r--mm/readahead.c39
-rw-r--r--mm/rmap.c69
-rw-r--r--mm/shmem.c62
-rw-r--r--mm/slab.c136
-rw-r--r--mm/slab.h20
-rw-r--r--mm/slab_common.c33
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c21
-rw-r--r--mm/swapfile.c13
-rw-r--r--mm/truncate.c21
-rw-r--r--mm/vmalloc.c196
-rw-r--r--mm/vmscan.c42
-rw-r--r--mm/vmstat.c95
-rw-r--r--mm/workingset.c114
-rw-r--r--mm/zsmalloc.c67
-rw-r--r--mm/zswap.c172
44 files changed, 1570 insertions, 1455 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 86e3e0e74d20..9b8fccb969dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
- depends on X86_64
+ depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
depends on NUMA
default n
help
@@ -447,13 +447,9 @@ choice
benefit.
endchoice
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
- depends on TRANSPARENT_HUGEPAGE && !PPC
+ depends on TRANSPARENT_HUGEPAGE
#
# UP and nommu archs use km based percpu allocator
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443f36d7..3bfed5ab2475 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
if (!wb->congested)
diff --git a/mm/compaction.c b/mm/compaction.c
index 0409a4ad6ea1..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc,
return pfn;
}
-/* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
-{
- struct page *page;
- unsigned int count[2] = { 0, };
-
- if (list_empty(&cc->migratepages))
- return;
-
- list_for_each_entry(page, &cc->migratepages, lru)
- count[!!page_is_file_cache(page)]++;
-
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
-}
-
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
@@ -834,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page_count(page) > page_mapcount(page))
goto isolate_fail;
+ /*
+ * Only allow to migrate anonymous pages in GFP_NOFS context
+ * because those do not depend on fs locks.
+ */
+ if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ goto isolate_fail;
+
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -866,6 +857,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
@@ -902,7 +895,6 @@ isolate_fail:
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
- acct_isolated(zone, cc);
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
cc->last_migrated_pfn = 0;
@@ -988,7 +980,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
break;
}
- acct_isolated(cc->zone, cc);
return pfn;
}
@@ -1258,10 +1249,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended) {
- acct_isolated(zone, cc);
+ if (!low_pfn || cc->contended)
return ISOLATE_ABORT;
- }
/*
* Either we isolated something and proceed with migration. Or
@@ -1271,7 +1260,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
break;
}
- acct_isolated(zone, cc);
/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;
@@ -1696,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
- int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /* Check if the GFP flags allow compaction */
- if (!may_enter_fs || !may_perform_io)
+ /*
+ * Check if the GFP flags allow compaction - GFP_NOIO is really
+ * tricky context because the migration might require IO
+ */
+ if (!may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1770,6 +1760,7 @@ static void compact_node(int nid)
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
};
@@ -1895,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
@@ -2043,33 +2035,38 @@ void kcompactd_stop(int nid)
* away, we get changed to run anywhere: as the first one comes back,
* restore their cpu bindings.
*/
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int kcompactd_cpu_online(unsigned int cpu)
{
int nid;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
- mask = cpumask_of_node(pgdat->node_id);
+ mask = cpumask_of_node(pgdat->node_id);
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
- }
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
}
- return NOTIFY_OK;
+ return 0;
}
static int __init kcompactd_init(void)
{
int nid;
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/compaction:online",
+ kcompactd_cpu_online, NULL);
+ if (ret < 0) {
+ pr_err("kcompactd: failed to register hotplug callbacks.\n");
+ return ret;
+ }
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
- hotcpu_notifier(cpu_callback, 0);
return 0;
}
subsys_initcall(kcompactd_init)
diff --git a/mm/debug.c b/mm/debug.c
index 9feb699c5d25..db1cd26d8752 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+ sizeof(unsigned long), page,
+ sizeof(struct page), false);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
diff --git a/mm/filemap.c b/mm/filemap.c
index 50b52fe51937..32be3c8f3a11 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -132,44 +132,28 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!dax_mapping(mapping)) {
if (shadowp)
*shadowp = p;
- if (node)
- workingset_node_shadows_dec(node);
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
- (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
- RADIX_DAX_ENTRY_LOCK));
- /* DAX accounts exceptional entries as normal pages */
- if (node)
- workingset_node_pages_dec(node);
+ dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
/* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index,
+ dax_wake_mapping_entry_waiter(mapping, page->index, p,
false);
}
}
- radix_tree_replace_slot(slot, page);
+ __radix_tree_replace(&mapping->page_tree, node, slot, page,
+ workingset_update_node, mapping);
mapping->nrpages++;
- if (node) {
- workingset_node_pages_inc(node);
- /*
- * Don't track node that contains actual pages.
- *
- * Avoid acquiring the list_lru lock if already
- * untracked. The list_empty() test is safe as
- * node->private_list is protected by
- * mapping->tree_lock.
- */
- if (!list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- }
return 0;
}
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ int i, nr;
+
+ /* hugetlb pages are represented by one entry in the radix tree */
+ nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -182,44 +166,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index + i,
&node, &slot);
- radix_tree_clear_tags(&mapping->page_tree, node, slot);
-
- if (!node) {
- VM_BUG_ON_PAGE(nr != 1, page);
- /*
- * We need a node to properly account shadow
- * entries. Don't plant any without. XXX
- */
- shadow = NULL;
- }
-
- radix_tree_replace_slot(slot, shadow);
+ VM_BUG_ON_PAGE(!node && nr != 1, page);
- if (!node)
- break;
-
- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
- continue;
-
- /*
- * Track node that only contains shadow entries. DAX mappings
- * contain no shadow entries and may contain other exceptional
- * entries so skip those.
- *
- * Avoid acquiring the list_lru lock if already tracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes,
- &node->private_list);
- }
+ radix_tree_clear_tags(&mapping->page_tree, node, slot);
+ __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+ workingset_update_node, mapping);
}
if (shadow) {
@@ -1686,7 +1637,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
int error = 0;
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
- return -EINVAL;
+ return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
index = *ppos >> PAGE_SHIFT;
@@ -2213,12 +2164,12 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
@@ -2274,11 +2225,11 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2288,7 +2239,7 @@ skip:
put_page(page);
next:
/* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break;
if (iter.index == end_pgoff)
break;
diff --git a/mm/gup.c b/mm/gup.c
index ec4f82704b6f..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -632,7 +632,8 @@ next_page:
return i;
}
-bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+static bool vma_permits_fault(struct vm_area_struct *vma,
+ unsigned int fault_flags)
{
bool write = !!(fault_flags & FAULT_FLAG_WRITE);
bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
@@ -857,18 +858,17 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
EXPORT_SYMBOL(get_user_pages_locked);
/*
- * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
- * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
+ * tsk, mm to be specified.
*
* NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
- * caller if required (just like with __get_user_pages). "FOLL_GET",
- * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
- * according to the parameters "pages", "write", "force"
- * respectively.
+ * caller if required (just like with __get_user_pages). "FOLL_GET"
+ * is set implicitly if "pages" is non-NULL.
*/
-__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
int locked = 1;
@@ -880,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* get_user_pages_unlocked(tsk, mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead, if the two parameters
- * "tsk" and "mm" are respectively equal to current and current->mm,
- * or if "force" shall be set to 1 (get_user_pages_fast misses the
- * "force" parameter).
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
*/
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -920,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -963,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -974,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's. We also
- * obviously don't pass FOLL_REMOTE in here.
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d4a6e4001512..10eedbf14421 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
}
static struct kobj_attribute use_zero_page_attr =
__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+
+static ssize_t hpage_pmd_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+}
+static struct kobj_attribute hpage_pmd_size_attr =
+ __ATTR_RO(hpage_pmd_size);
+
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
+ &hpage_pmd_size_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
&shmem_enabled_attr.attr,
#endif
@@ -532,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
gfp_t gfp)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -563,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
*/
__SetPageUptodate(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
@@ -576,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -590,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -641,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct fault_env *fe)
+int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -654,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
@@ -670,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
set = false;
- if (pmd_none(*fe->pmd)) {
+ if (pmd_none(*vmf->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(fe->ptl);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ spin_unlock(vmf->ptl);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, fe->pmd, zero_page);
- spin_unlock(fe->ptl);
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
set = true;
}
} else
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
if (!set)
pte_free(vma->vm_mm, pgtable);
return ret;
@@ -697,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(fe, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -737,8 +747,9 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- if (track_pfn_insert(vma, &pgprot, pfn))
- return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
@@ -868,30 +879,30 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
- fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = fe->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
- fe->flags & FAULT_FLAG_WRITE))
- update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
+ vmf->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
@@ -910,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE, vma,
- fe->address, page_to_nid(page));
+ vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
@@ -941,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -958,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- fe->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*fe->pte));
- set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
- pte_unmap(fe->pte);
+ vmf->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, fe->pmd, pgtable);
+ pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -982,7 +993,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
@@ -994,23 +1005,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
- fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1023,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1042,12 +1053,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
+ ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1059,7 +1070,7 @@ alloc:
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
put_page(page);
ret |= VM_FAULT_FALLBACK;
@@ -1079,11 +1090,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(fe->ptl);
+ spin_lock(vmf->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
- spin_unlock(fe->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1091,12 +1102,12 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
} else {
@@ -1106,13 +1117,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
@@ -1185,12 +1196,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
+int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1198,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
bool was_writable;
int flags = 0;
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(pmd, *fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
@@ -1207,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*fe->pmd))) {
- page = pmd_page(*fe->pmd);
- spin_unlock(fe->ptl);
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1242,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1253,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* to serialises splits
*/
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(pmd, *fe->pmd))) {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1276,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- fe->pmd, pmd, fe->address, page, target_nid);
+ vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1293,18 +1304,19 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ vmf->flags);
return 0;
}
@@ -1322,6 +1334,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = tlb->mm;
bool ret = false;
+ tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
goto out_unlocked;
@@ -1377,12 +1391,23 @@ out_unlocked:
return ret;
}
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pte_free(mm, pgtable);
+ atomic_long_dec(&mm->nr_ptes);
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
+ tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
@@ -1398,12 +1423,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
- tlb_remove_page(tlb, pmd_page(orig_pmd));
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
- tlb_remove_page(tlb, pmd_page(orig_pmd));
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
@@ -1416,6 +1441,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
atomic_long_dec(&tlb->mm->nr_ptes);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -1424,6 +1451,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+ spinlock_t *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ /*
+ * With split pmd lock we also need to move preallocated
+ * PTE page table if new_pmd is on different PMD page table.
+ *
+ * We also don't deposit and withdraw tables for file pages.
+ */
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1461,8 +1503,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
- vma_is_anonymous(vma)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -1588,6 +1629,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ /*
+ * We are going to unmap this huge page. So
+ * just go ahead and zap it
+ */
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418bf01a50ed..3edb759c5c7d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ /*
+ * This is a hugetlb vma, all the pte entries should point
+ * to huge page.
+ */
+ tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
@@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- tlb_remove_tlb_entry(tlb, ptep, address);
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ unsigned long address, pte_t *ptep,
+ struct page *pagecache_page, spinlock_t *ptl)
{
+ pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
@@ -3711,8 +3718,7 @@ retry:
vma_end_reservation(h, vma, address);
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -3733,7 +3739,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep,
+ pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
@@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a56a851908d2..975e49f00f34 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,6 +6,7 @@
#include <linux/cpumask.h>
#include <linux/atomic.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -21,5 +22,6 @@ struct mm_struct init_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,7 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0e9505f66ec1..b2a0cff2bb35 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
/* Unpoison the stack for the current task beyond a watermark sp value. */
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
{
- __kasan_unpoison_stack(current, watermark);
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
}
/*
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index baabaad4a4aa..dae929c02bbb 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
qlist_init(from);
}
-static void qlist_move(struct qlist_head *from, struct qlist_node *last,
- struct qlist_head *to, size_t size)
-{
- if (unlikely(last == from->tail)) {
- qlist_move_all(from, to);
- return;
- }
- if (qlist_empty(to))
- to->head = from->head;
- else
- to->tail->next = from->head;
- to->tail = last;
- from->head = last->next;
- last->next = NULL;
- from->bytes -= size;
- to->bytes += size;
-}
-
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+ (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
/*
* The object quarantine consists of per-cpu queues and a global queue,
@@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
*/
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
-static struct qlist_head global_quarantine;
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
/* Maximum size of the global queue. */
-static unsigned long quarantine_size;
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
/*
* The fraction of physical memory the quarantine is allowed to occupy.
@@ -124,9 +120,6 @@ static unsigned long quarantine_size;
*/
#define QUARANTINE_FRACTION 32
-#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
-#define QUARANTINE_PERCPU_SIZE (1 << 20)
-
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
@@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (unlikely(!qlist_empty(&temp))) {
spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_all(&temp, &global_quarantine);
+ WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+ qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+ if (global_quarantine[quarantine_tail].bytes >=
+ READ_ONCE(quarantine_batch_size)) {
+ int new_tail;
+
+ new_tail = quarantine_tail + 1;
+ if (new_tail == QUARANTINE_BATCHES)
+ new_tail = 0;
+ if (new_tail != quarantine_head)
+ quarantine_tail = new_tail;
+ }
spin_unlock_irqrestore(&quarantine_lock, flags);
}
}
void quarantine_reduce(void)
{
- size_t new_quarantine_size, percpu_quarantines;
+ size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
struct qlist_head to_free = QLIST_INIT;
- size_t size_to_free = 0;
- struct qlist_node *last;
- if (likely(READ_ONCE(global_quarantine.bytes) <=
- READ_ONCE(quarantine_size)))
+ if (likely(READ_ONCE(quarantine_size) <=
+ READ_ONCE(quarantine_max_size)))
return;
spin_lock_irqsave(&quarantine_lock, flags);
@@ -214,24 +216,23 @@ void quarantine_reduce(void)
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
- new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
- new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
- 0 : new_quarantine_size - percpu_quarantines;
- WRITE_ONCE(quarantine_size, new_quarantine_size);
-
- last = global_quarantine.head;
- while (last) {
- struct kmem_cache *cache = qlink_to_cache(last);
-
- size_to_free += cache->size;
- if (!last->next || size_to_free >
- global_quarantine.bytes - QUARANTINE_LOW_SIZE)
- break;
- last = last->next;
+ new_quarantine_size = (total_size < percpu_quarantines) ?
+ 0 : total_size - percpu_quarantines;
+ WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+ /* Aim at consuming at most 1/2 of slots in quarantine. */
+ WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+ 2 * total_size / QUARANTINE_BATCHES));
+
+ if (likely(quarantine_size > quarantine_max_size)) {
+ qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+ WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+ quarantine_head++;
+ if (quarantine_head == QUARANTINE_BATCHES)
+ quarantine_head = 0;
}
- qlist_move(&global_quarantine, last, &to_free, size_to_free);
spin_unlock_irqrestore(&quarantine_lock, flags);
@@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg)
void quarantine_remove_cache(struct kmem_cache *cache)
{
- unsigned long flags;
+ unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_cache(&global_quarantine, &to_free, cache);
+ for (i = 0; i < QUARANTINE_BATCHES; i++)
+ qlist_move_cache(&global_quarantine[i], &to_free, cache);
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 073325aedc68..b82b3e215157 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
kasan_enable_current();
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 87e1a7ca3846..e32389a97030 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- pte_t pteval;
int swapped_in = 0, ret = 0;
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
};
/* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- fe.pte = pte_offset_map(pmd, address);
- for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- fe.pte++, fe.address += PAGE_SIZE) {
- pteval = *fe.pte;
- if (!is_swap_pte(pteval))
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
continue;
swapped_in++;
- ret = do_swap_page(&fe, pteval);
+ ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- fe.pte = pte_offset_map(pmd, fe.address);
+ vmf.pte = pte_offset_map(pmd, vmf.address);
}
- fe.pte--;
- pte_unmap(fe.pte);
+ vmf.pte--;
+ pte_unmap(vmf.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
+ bool deposited = false;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
+ /*
+ * now deposit the pgtable for arch that need it
+ * otherwise free it.
+ */
+ if (arch_needs_pgtable_deposit()) {
+ /*
+ * The deposit should be visibile only after
+ * collapse is seen by others.
+ */
+ smp_wmb();
+ pgtable_trans_huge_deposit(vma->vm_mm, pmd,
+ pmd_pgtable(_pmd));
+ deposited = true;
+ }
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
- atomic_long_dec(&vma->vm_mm->nr_ptes);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (!deposited) {
+ atomic_long_dec(&vma->vm_mm->nr_ptes);
+ pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ }
}
}
i_mmap_unlock_write(mapping);
@@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm,
spin_lock_irq(&mapping->tree_lock);
+ slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(slot,
+ radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
+ slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
@@ -1521,9 +1542,10 @@ tree_unlocked:
if (!page || iter.index < page->index) {
if (!nr_none)
break;
- /* Put holes back where they were */
- radix_tree_replace_slot(slot, NULL);
nr_none--;
+ /* Put holes back where they were */
+ radix_tree_delete(&mapping->page_tree,
+ iter.index);
continue;
}
@@ -1532,7 +1554,9 @@ tree_unlocked:
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(slot, page);
+ radix_tree_replace_slot(&mapping->page_tree,
+ slot, page);
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
@@ -1616,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d1380ed93fdf..da3436953022 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -19,7 +19,7 @@
*
*
* For more information on the algorithm and kmemleak usage, please see
- * Documentation/kmemleak.txt.
+ * Documentation/dev-tools/kmemleak.rst.
*
* Notes on locking
* ----------------
diff --git a/mm/madvise.c b/mm/madvise.c
index 93fb63e88b5e..0e3828eae9f8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f870ba43942..175ec51c346d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1816,22 +1816,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- if (action == CPU_ONLINE)
- return NOTIFY_OK;
-
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- return NOTIFY_OK;
+ return 0;
}
static void reclaim_high(struct mem_cgroup *memcg,
@@ -2154,6 +2145,8 @@ struct memcg_kmem_cache_create_work {
struct work_struct work;
};
+static struct workqueue_struct *memcg_kmem_cache_create_wq;
+
static void memcg_kmem_cache_create_func(struct work_struct *w)
{
struct memcg_kmem_cache_create_work *cw =
@@ -2185,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- schedule_work(&cw->work);
+ queue_work(memcg_kmem_cache_create_wq, &cw->work);
}
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -5774,16 +5767,28 @@ __setup("cgroup.memory=", cgroup_memory);
/*
* subsys_initcall() for memory controller.
*
- * Some parts like hotcpu_notifier() have to be initialized from this context
- * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
- * everything that doesn't depend on a specific mem_cgroup structure should
- * be initialized from here.
+ * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
+ * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
+ * basically everything that doesn't depend on a specific mem_cgroup structure
+ * should be initialized from here.
*/
static int __init mem_cgroup_init(void)
{
int cpu, node;
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+#ifndef CONFIG_SLOB
+ /*
+ * Kmem cache creation is mostly done with the slab_mutex held,
+ * so use a special workqueue to avoid stalling all worker
+ * threads in case lots of cgroups are created simultaneously.
+ */
+ memcg_kmem_cache_create_wq =
+ alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
+ BUG_ON(!memcg_kmem_cache_create_wq);
+#endif
+
+ cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
+ memcg_hotplug_cpu_dead);
for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
diff --git a/mm/memory.c b/mm/memory.c
index e18c57bdc75c..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
-
- if (!tlb->page_size)
- tlb->page_size = page_size;
- else {
- if (page_size != tlb->page_size)
- return true;
- }
+ VM_WARN_ON(tlb->page_size != page_size);
batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
}
VM_BUG_ON_PAGE(batch->nr > batch->max, page);
- batch->pages[batch->nr++] = page;
return false;
}
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
end -= PMD_SIZE;
if (addr > end - 1)
return;
-
+ /*
+ * We add page table cache pages with PAGE_SIZE,
+ * (see pte_free_tlb()), flush the tlb if we need
+ */
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
- struct page *pending_page = NULL;
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
- pending_page = page;
addr += PAGE_SIZE;
break;
}
@@ -1213,11 +1214,6 @@ again:
if (force_flush) {
force_flush = 0;
tlb_flush_mmu_free(tlb);
- if (pending_page) {
- /* remove the page with new size */
- __tlb_remove_pte_page(tlb, pending_page);
- pending_page = NULL;
- }
if (addr != end)
goto again;
}
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
- split_huge_pmd(vma, pmd, addr);
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1637,8 +1633,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
- return -EINVAL;
+
+ track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
@@ -1655,8 +1651,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, pfn))
- return -EINVAL;
+
+ track_pfn_insert(vma, &pgprot, pfn);
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -2038,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2067,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2074,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2088,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2139,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2176,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2187,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2196,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2205,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2243,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2267,79 +2269,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns
+ * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2360,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2377,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2412,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2517,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2528,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2546,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2577,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2594,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2610,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2633,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2671,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2737,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2749,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2762,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2789,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2807,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2847,62 +2863,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
+ ret = vma->vm_ops->fault(vma, vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2917,11 +2921,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2939,11 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ /*
+ * We are going to consume the prealloc table,
+ * count that as nr_ptes.
+ */
+ atomic_long_inc(&vma->vm_mm->nr_ptes);
+ vmf->prealloc_pte = 0;
+}
+
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2953,8 +2970,19 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
ret = VM_FAULT_FALLBACK;
page = compound_head(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ /*
+ * Archs like ppc64 need additonal space to store information
+ * related to pte entry. Use the preallocated table for that.
+ */
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2966,20 +2994,32 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
page_add_file_rmap(page, true);
+ /*
+ * deposit and withdraw with pmd lock held
+ */
+ if (arch_needs_pgtable_deposit())
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
- spin_unlock(fe->ptl);
+ /*
+ * If we are going to fallback to pte mapping, do a
+ * withdraw with pmd lock held.
+ */
+ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+ vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+ vmf->pmd);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -2990,42 +3030,45 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
- return ret;
+ goto fault_handled;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
- return ret;
+ goto fault_handled;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte)))
- return VM_FAULT_NOPAGE;
+ if (unlikely(!pte_none(*vmf->pte))) {
+ ret = VM_FAULT_NOPAGE;
+ goto fault_handled;
+ }
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
@@ -3034,19 +3077,58 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ ret = 0;
- return 0;
+fault_handled:
+ /* preallocated pagetable is unused: free it */
+ if (vmf->prealloc_pte) {
+ pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
+ }
+ return ret;
+}
+
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
}
static unsigned long fault_around_bytes __read_mostly =
@@ -3113,17 +3195,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3131,50 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
- /* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
- }
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3183,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3265,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3314,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return do_read_fault(vmf);
if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ return do_cow_fault(vmf);
+ return do_shared_fault(vmf);
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3344,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
+ pte_t pte = vmf->orig_pte;
bool was_writable = pte_write(pte);
int flags = 0;
@@ -3364,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -3376,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3411,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3433,28 +3476,28 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
+ return do_huge_pmd_anonymous_page(vmf);
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+ vmf->flags);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+ vmf->pmd, vmf->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- split_huge_pmd(fe->vma, fe->pmd, fe->address);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3479,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3501,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3514,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3553,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3570,10 +3613,12 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3583,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+ vmf.pmd = pmd_alloc(mm, pud, address);
+ if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
+ pmd_t orig_pmd = *vmf.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
/*
@@ -3772,8 +3817,8 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
+ spinlock_t **ptlp)
{
int res;
@@ -3868,7 +3913,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -3883,7 +3928,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
@@ -3966,6 +4011,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
return ret;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/*
* Print the name of a VMA.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cad4b9125695..e43142c15631 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- memblock_set_bottom_up(true);
movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0b859af06b87..6d3639e1f254 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return ERR_PTR(-EINVAL);
}
} else if (mode == MPOL_LOCAL) {
- if (!nodes_empty(*nodes))
+ if (!nodes_empty(*nodes) ||
+ (flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
@@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmd, addr);
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
get_page(page);
spin_unlock(ptl);
@@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
int nd)
{
- switch (policy->mode) {
- case MPOL_PREFERRED:
- if (!(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- break;
- case MPOL_BIND:
+ if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
+ else {
/*
- * Normally, MPOL_BIND allocations are node-local within the
- * allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node isn't part of the mask, we use the zonelist for
- * the first node in the mask instead.
+ * __GFP_THISNODE shouldn't even be used with the bind policy
+ * because we might easily break the expectation to stay on the
+ * requested node and not break the policy.
*/
- if (unlikely(gfp & __GFP_THISNODE) &&
- unlikely(!node_isset(nd, policy->v.nodes)))
- nd = first_node(policy->v.nodes);
- break;
- default:
- BUG();
+ WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
+
return node_zonelist(nd, gfp);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 99250aee1ac1..0ed24b1fa77b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
/*
* We isolated non-lru movable page so here we can use
* __PageMovable because LRU page's mapping cannot have
@@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
put_page(page);
} else {
putback_lru_page(page);
+ dec_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
}
}
}
@@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
/*
* Drop cache reference from old page by unfreezing
@@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
@@ -1121,8 +1121,15 @@ out:
* restored.
*/
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+
+ /*
+ * Compaction can migrate also non-LRU pages which are
+ * not accounted to NR_ISOLATED_*. They can be recognized
+ * as __PageMovable
+ */
+ if (likely(!__PageMovable(page)))
+ dec_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
}
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 11936526b08b..cc2459c57f60 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
+ int target_node = NUMA_NO_NODE;
pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
if (!pte)
return 0;
+ /* Get target node for single threaded private VMAs */
+ if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+ atomic_read(&vma->vm_mm->mm_users) == 1)
+ target_node = numa_node_id();
+
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ if (target_node == page_to_nid(page))
+ continue;
}
ptent = ptep_modify_prot_start(mm, addr, pte);
@@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
- split_huge_pmd(vma, pmd, addr);
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
if (pmd_trans_unstable(pmd))
continue;
} else {
@@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return do_mprotect_pkey(start, len, prot, -1);
}
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
unsigned long, prot, int, pkey)
{
@@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
*/
return ret;
}
+
+#endif /* CONFIG_ARCH_HAS_PKEYS */
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b8faaf2a9e9..210d7ec2843c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -1801,14 +1801,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
}
EXPORT_SYMBOL(filemap_map_pages);
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -1878,6 +1878,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
mmput(mm);
return len;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903..290e8b7d3181 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1778,6 +1778,7 @@ pause:
pause,
start_time);
__set_current_state(TASK_KILLABLE);
+ wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
@@ -2105,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged;
-
- do {
- spin_lock_irq(&mapping->tree_lock);
- tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
- &start, end, WRITEBACK_TAG_BATCH,
- PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ unsigned long tagged = 0;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ PAGECACHE_TAG_DIRTY) {
+ if (iter.index > end)
+ break;
+ radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ PAGECACHE_TAG_TOWRITE);
+ tagged++;
+ if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
- WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
- /* We check 'start' to handle wrapping when end == ~0UL */
- } while (tagged >= WRITEBACK_TAG_BATCH && start);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440e3ae2..2c6d5f64feca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2058,8 +2058,12 @@ out_unlock:
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
*/
-static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
@@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
struct zone *zone;
struct page *page;
int order;
+ bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
ac->nodemask) {
- /* Preserve at least one pageblock */
- if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
continue;
/*
- * It should never happen but changes to locking could
- * inadvertently allow a per-cpu drain to add pages
- * to MIGRATE_HIGHATOMIC while unreserving so be safe
- * and watch for underflows.
+ * In page freeing path, migratetype change is racy so
+ * we can counter several free pages in a pageblock
+ * in this loop althoug we changed the pageblock type
+ * from highatomic to ac->migratetype. So we should
+ * adjust the count once.
*/
- zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
- zone->nr_reserved_highatomic);
+ if (get_pageblock_migratetype(page) ==
+ MIGRATE_HIGHATOMIC) {
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ zone->nr_reserved_highatomic -= min(
+ pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+ }
/*
* Convert to ac->migratetype and avoid the normal
@@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- move_freepages_block(zone, page, ac->migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
- return;
+ ret = move_freepages_block(zone, page, ac->migratetype);
+ if (ret) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
+
+ return false;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal)
+ if (can_steal &&
+ get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, bool cold)
{
- int i;
+ int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
+ alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
+
+ /*
+ * i pages were removed from the buddy list even if some leak due
+ * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+ * on i. Do not confuse with 'alloced' which is the number of
+ * pages added to the pcp list.
+ */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return i;
+ return alloced;
}
#ifdef CONFIG_NUMA
@@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+ && mt != MIGRATE_HIGHATOMIC)
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3305,7 +3340,7 @@ retry:
* Shrink them them and try again
*/
if (!page && !drained) {
- unreserve_highatomic_pageblock(ac);
+ unreserve_highatomic_pageblock(ac, false);
drain_all_pages(NULL);
drained = true;
goto retry;
@@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (*no_progress_loops > MAX_RECLAIM_RETRIES)
- return false;
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
+ /* Before OOM, exhaust highatomic_reserve */
+ return unreserve_highatomic_pageblock(ac, true);
+ }
/*
* Keep reclaiming pages while there is a chance this will lead
@@ -3888,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
+void __page_frag_drain(struct page *page, unsigned int order,
+ unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count)) {
+ if (order == 0)
+ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+EXPORT_SYMBOL(__page_frag_drain);
+
void *__alloc_page_frag(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
@@ -6399,8 +6450,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK (%p - %p)\n",
- s, pages << (PAGE_SHIFT - 10), start, end);
+ pr_info("Freeing %s memory: %ldK\n",
+ s, pages << (PAGE_SHIFT - 10));
return pages;
}
@@ -6491,38 +6542,39 @@ void __init free_area_init(unsigned long *zones_size)
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-static int page_alloc_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int page_alloc_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- lru_add_drain_cpu(cpu);
- drain_pages(cpu);
+ lru_add_drain_cpu(cpu);
+ drain_pages(cpu);
- /*
- * Spill the event counters of the dead processor
- * into the current processors event counters.
- * This artificially elevates the count of the current
- * processor.
- */
- vm_events_fold_cpu(cpu);
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
+ vm_events_fold_cpu(cpu);
- /*
- * Zero the differential counters of the dead processor
- * so that the vm statistics are consistent.
- *
- * This is only okay since the processor is dead and cannot
- * race with what we are doing.
- */
- cpu_vm_stats_fold(cpu);
- }
- return NOTIFY_OK;
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
+ cpu_vm_stats_fold(cpu);
+ return 0;
}
void __init page_alloc_init(void)
{
- hotcpu_notifier(page_alloc_cpu_notify, 0);
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
+ "mm/page_alloc:dead", NULL,
+ page_alloc_cpu_dead);
+ WARN_ON(ret < 0);
}
/*
diff --git a/mm/page_io.c b/mm/page_io.c
index a2651f58c86a..23f6d0d3470f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- if (wbc->sync_mode == WB_SYNC_ALL)
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
- else
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index 255714302394..0686f566d347 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -886,7 +886,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
- if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
+ !is_power_of_2(align))) {
WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align);
return NULL;
@@ -2093,6 +2094,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
size_t pages_size;
struct page **pages;
int unit, i, j, rc;
+ int upa;
+ int nr_g0_units;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
@@ -2100,7 +2103,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
- BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+ upa = ai->alloc_size/ai->unit_size;
+ nr_g0_units = roundup(num_possible_cpus(), upa);
+ if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+ pcpu_free_alloc_info(ai);
+ return -EINVAL;
+ }
unit_pages = ai->unit_size >> PAGE_SHIFT;
@@ -2111,21 +2119,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* allocate pages */
j = 0;
- for (unit = 0; unit < num_possible_cpus(); unit++)
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned int cpu = ai->groups[0].cpu_map[unit];
for (i = 0; i < unit_pages; i++) {
- unsigned int cpu = ai->groups[0].cpu_map[unit];
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
- psize_str, cpu);
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(ptr);
pages[j++] = virt_to_page(ptr);
}
+ }
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- unsigned int flags = FOLL_REMOTE;
+ unsigned int flags = 0;
/* Work out address and page range required */
if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
size_t bytes;
/*
* Get the pages we're interested in. We must
- * add FOLL_REMOTE because task/mm might not
+ * access remotely because task/mm might not
* current/current->mm
*/
- pages = __get_user_pages_unlocked(task, mm, pa, pages,
- process_pages, flags);
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..c4ca70239233 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -207,12 +207,21 @@ out:
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct file_ra_state *ra = &filp->f_ra;
+ unsigned long max_pages;
+
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
- nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
+ /*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+ nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
int err;
@@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
- unsigned long max = ra->ra_pages;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ unsigned long max_pages = ra->ra_pages;
pgoff_t prev_offset;
/*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ if (req_size > max_pages && bdi->io_pages > max_pages)
+ max_pages = min(req_size, bdi->io_pages);
+
+ /*
* start of file
*/
if (!offset)
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max);
+ start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max)
+ if (!start || start - offset > max_pages)
return 0;
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
/*
* oversize read
*/
- if (req_size > max)
+ if (req_size > max_pages)
goto initial_readahead;
/*
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max))
+ if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;
/*
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
initial_readahead:
ra->start = offset;
- ra->size = get_init_ra_size(req_size, max);
+ ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
@@ -454,7 +471,7 @@ readit:
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max);
+ ra->async_size = get_next_ra_size(ra, max_pages);
ra->size += ra->async_size;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ef36404e7b2..91619fd70939 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
}
/**
- * anon_vma_prepare - attach an anon_vma to a memory region
+ * __anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
*
* This makes sure the memory mapping described by 'vma' has
* an 'anon_vma' attached to it, so that we can associate the
* anonymous pages mapped into it with that anon_vma.
*
- * The common case will be that we already have one, but if
+ * The common case will be that we already have one, which
+ * is handled inline by anon_vma_prepare(). But if
* not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
@@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
*
* This must be called with the mmap_sem held for reading.
*/
-int anon_vma_prepare(struct vm_area_struct *vma)
+int __anon_vma_prepare(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
might_sleep();
- if (unlikely(!anon_vma)) {
- struct mm_struct *mm = vma->vm_mm;
- struct anon_vma *allocated;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_enomem;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_enomem;
+
+ anon_vma = find_mergeable_anon_vma(vma);
+ allocated = NULL;
+ if (!anon_vma) {
+ anon_vma = anon_vma_alloc();
+ if (unlikely(!anon_vma))
+ goto out_enomem_free_avc;
+ allocated = anon_vma;
+ }
- anon_vma = find_mergeable_anon_vma(vma);
+ anon_vma_lock_write(anon_vma);
+ /* page_table_lock to protect against threads */
+ spin_lock(&mm->page_table_lock);
+ if (likely(!vma->anon_vma)) {
+ vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+ /* vma reference or self-parent link for new root */
+ anon_vma->degree++;
allocated = NULL;
- if (!anon_vma) {
- anon_vma = anon_vma_alloc();
- if (unlikely(!anon_vma))
- goto out_enomem_free_avc;
- allocated = anon_vma;
- }
+ avc = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(anon_vma);
- anon_vma_lock_write(anon_vma);
- /* page_table_lock to protect against threads */
- spin_lock(&mm->page_table_lock);
- if (likely(!vma->anon_vma)) {
- vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
- allocated = NULL;
- avc = NULL;
- }
- spin_unlock(&mm->page_table_lock);
- anon_vma_unlock_write(anon_vma);
+ if (unlikely(allocated))
+ put_anon_vma(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
- if (unlikely(allocated))
- put_anon_vma(allocated);
- if (unlikely(avc))
- anon_vma_chain_free(avc);
- }
return 0;
out_enomem_free_avc:
diff --git a/mm/shmem.c b/mm/shmem.c
index aa47e6baecde..b1b20dc63265 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages)
static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
+ struct radix_tree_node *node;
void **pslot;
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (!pslot)
+ item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+ if (!item)
return -ENOENT;
- item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- radix_tree_replace_slot(pslot, replacement);
+ __radix_tree_replace(&mapping->page_tree, node, pslot,
+ replacement, NULL, NULL);
return 0;
}
@@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
int shmem_huge __read_mostly;
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge)
return "bad_val";
}
}
+#endif
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
@@ -658,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
swapped++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
@@ -1046,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned long found = -1;
+ unsigned int checked = 0;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, root, &iter, 0) {
+ if (*slot == item) {
+ found = iter.index;
+ break;
+ }
+ checked++;
+ if ((checked % 4096) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
@@ -1059,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->page_tree, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -1539,7 +1566,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct mm_struct *fault_mm, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info;
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
@@ -1589,7 +1616,6 @@ repeat:
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = fault_mm ? : current->mm;
@@ -1837,7 +1863,6 @@ unlock:
put_page(page);
}
if (error == -ENOSPC && !once++) {
- info = SHMEM_I(inode);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1848,6 +1873,18 @@ unlock:
return error;
}
+/*
+ * This is like autoremove_wake_function, but it removes the wait queue
+ * entry unconditionally - even if something else had already woken the
+ * target.
+ */
+static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+ list_del_init(&wait->task_list);
+ return ret;
+}
+
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -1883,7 +1920,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
wait_queue_head_t *shmem_falloc_waitq;
- DEFINE_WAIT(shmem_fault_wait);
+ DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
@@ -2434,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping)
}
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2504,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
spin_unlock_irq(&mapping->tree_lock);
continue_resched:
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2665,6 +2702,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
diff --git a/mm/slab.c b/mm/slab.c
index 0b0550ca85b4..29bc6c0dedd0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
+ parent->total_slabs = 0;
+ parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
- parent->num_slabs = 0;
}
#define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -551,12 +552,7 @@ static void start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
- /*
- * When this gets called from do_initcalls via cpucache_init(),
- * init_workqueues() has already run, so keventd will be setup
- * at that time.
- */
- if (keventd_up() && reap_work->work.func == NULL) {
+ if (reap_work->work.func == NULL) {
init_reap_node(cpu);
INIT_DEFERRABLE_WORK(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
@@ -1366,7 +1362,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
#if DEBUG
struct kmem_cache_node *n;
- struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1376,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
- unsigned long active_slabs = 0, num_slabs = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full;
+ unsigned long total_slabs, free_slabs, free_objs;
spin_lock_irqsave(&n->list_lock, flags);
- num_slabs = n->num_slabs;
- list_for_each_entry(page, &n->slabs_partial, lru) {
- active_objs += page->active;
- num_slabs_partial++;
- }
- list_for_each_entry(page, &n->slabs_free, lru)
- num_slabs_free++;
-
- free_objects += n->free_objects;
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs -
- (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);
-
- pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs, num_objs,
- free_objects);
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
}
#endif
}
@@ -2318,7 +2299,8 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
- n->num_slabs--;
+ n->free_slabs--;
+ n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2332,7 +2314,7 @@ out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0;
int node;
@@ -2352,7 +2334,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- return __kmem_cache_shrink(cachep, false);
+ return __kmem_cache_shrink(cachep);
}
void __kmem_cache_release(struct kmem_cache *cachep)
@@ -2753,12 +2735,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
- if (!page->active)
+ n->total_slabs++;
+ if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
- else
+ n->free_slabs++;
+ } else
fixup_slab_list(cachep, n, page, &list);
- n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@@ -2903,9 +2886,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
/* Move pfmemalloc slab to the end of list to speed up next search */
list_del(&page->lru);
- if (!page->active)
+ if (!page->active) {
list_add_tail(&page->lru, &n->slabs_free);
- else
+ n->free_slabs++;
+ } else
list_add_tail(&page->lru, &n->slabs_partial);
list_for_each_entry(page, &n->slabs_partial, lru) {
@@ -2913,9 +2897,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
return page;
}
+ n->free_touched = 1;
list_for_each_entry(page, &n->slabs_free, lru) {
- if (!PageSlabPfmemalloc(page))
+ if (!PageSlabPfmemalloc(page)) {
+ n->free_slabs--;
return page;
+ }
}
return NULL;
@@ -2925,16 +2912,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
- page = list_first_entry_or_null(&n->slabs_partial,
- struct page, lru);
+ assert_spin_locked(&n->list_lock);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
- page = list_first_entry_or_null(&n->slabs_free,
- struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_free, struct page,
+ lru);
+ if (page)
+ n->free_slabs--;
}
if (sk_memalloc_socks())
- return get_valid_first_slab(n, page, pfmemalloc);
+ page = get_valid_first_slab(n, page, pfmemalloc);
return page;
}
@@ -3434,9 +3423,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
- if (page->active == 0)
+ if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
- else {
+ n->free_slabs++;
+ } else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3450,7 +3440,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
- n->num_slabs--;
+ n->free_slabs--;
+ n->total_slabs--;
}
}
@@ -4102,64 +4093,33 @@ out:
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- struct page *page;
- unsigned long active_objs;
- unsigned long num_objs;
- unsigned long active_slabs = 0;
- unsigned long num_slabs, free_objects = 0, shared_avail = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full = 0;
- const char *name;
- char *error = NULL;
+ unsigned long active_objs, num_objs, active_slabs;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;
- active_objs = 0;
- num_slabs = 0;
for_each_kmem_cache_node(cachep, node, n) {
-
check_irq_on();
spin_lock_irq(&n->list_lock);
- num_slabs += n->num_slabs;
-
- list_for_each_entry(page, &n->slabs_partial, lru) {
- if (page->active == cachep->num && !error)
- error = "slabs_partial accounting error";
- if (!page->active && !error)
- error = "slabs_partial accounting error";
- active_objs += page->active;
- num_slabs_partial++;
- }
-
- list_for_each_entry(page, &n->slabs_free, lru) {
- if (page->active && !error)
- error = "slabs_free accounting error";
- num_slabs_free++;
- }
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
+ free_objs += n->free_objects;
- free_objects += n->free_objects;
if (n->shared)
shared_avail += n->shared->avail;
spin_unlock_irq(&n->list_lock);
}
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);
-
- if (num_objs - active_objs != free_objects && !error)
- error = "free_objects accounting error";
-
- name = cachep->name;
- if (error)
- pr_err("slab: cache %s error: %s\n", name, error);
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
+ active_objs = num_objs - free_objs;
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
- sinfo->num_slabs = num_slabs;
+ sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
diff --git a/mm/slab.h b/mm/slab.h
index bc05fdc3edce..de6579dc362c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_CACHE_FLAGS (0)
#endif
+/* Common flags available with current configuration */
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+/* Common flags permitted for kmem_cache_create */
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
+ SLAB_RED_ZONE | \
+ SLAB_POISON | \
+ SLAB_STORE_USER | \
+ SLAB_TRACE | \
+ SLAB_CONSISTENCY_CHECKS | \
+ SLAB_MEM_SPREAD | \
+ SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | \
+ SLAB_NOTRACK | \
+ SLAB_ACCOUNT)
+
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *, bool);
+int __kmem_cache_shrink(struct kmem_cache *);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -432,7 +447,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long num_slabs;
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 329b03843863..ae323841adb1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
goto out_unlock;
}
+ /* Refuse requests with allocator specific flags */
+ if (flags & ~SLAB_FLAGS_PERMITTED) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
get_online_cpus();
get_online_mems();
+#ifdef CONFIG_SLUB
+ /*
+ * In case of SLUB, we need to disable empty slab caching to
+ * avoid pinning the offline memory cgroup by freeable kmem
+ * pages charged to it. SLAB doesn't need this, as it
+ * periodically purges unused slabs.
+ */
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
+ if (c) {
+ c->cpu_partial = 0;
+ c->min_partial = 0;
+ }
+ }
+ mutex_unlock(&slab_mutex);
+ /*
+ * kmem_cache->cpu_partial is checked locklessly (see
+ * put_cpu_partial()). Make sure the change is visible.
+ */
+ synchronize_sched();
+#endif
+
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
if (!is_root_cache(s))
@@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
if (!c)
continue;
- __kmem_cache_shrink(c, true);
+ __kmem_cache_shrink(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
@@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep, false);
+ ret = __kmem_cache_shrink(cachep);
put_online_mems();
put_online_cpus();
return ret;
diff --git a/mm/slob.c b/mm/slob.c
index 5ec158054ffe..eac04d4357ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
{
}
-int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index 2b3e740609e9..067598a00849 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
struct detached_freelist df;
size = build_detached_freelist(s, size, p, &df);
- if (unlikely(!df.page))
+ if (!df.page)
continue;
slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
unsigned long flags;
int ret = 0;
- if (deactivate) {
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- s->cpu_partial = 0;
- s->min_partial = 0;
-
- /*
- * s->cpu_partial is checked locklessly (see put_cpu_partial),
- * so we have to make sure the change is visible.
- */
- synchronize_sched();
- }
-
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
INIT_LIST_HEAD(&discard);
@@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s, false);
+ __kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);
return 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f30438970cd1..1c6e0321205d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
+ cond_resched();
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
@@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm,
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
break;
+ cond_resched();
}
up_read(&mm->mmap_sem);
return (ret < 0)? ret: 0;
@@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
- if (frontswap) {
- if (frontswap_test(si, i))
- break;
- else
- continue;
- }
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
- break;
+ if (!frontswap || frontswap_test(si, i))
+ break;
+ if ((i % LATENCY_LIMIT) == 0)
+ cond_resched();
}
return i;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 8d8c62d89e6d..fd97f1dbce29 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
- &slot))
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
goto unlock;
if (*slot != entry)
goto unlock;
- radix_tree_replace_slot(slot, NULL);
+ __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
+ workingset_update_node, mapping);
mapping->nrexceptional--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f2481cb4e6b2..a5584384eabc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
- might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+ might_sleep();
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual criticial section protected
+ * by this look, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
+
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -615,59 +622,40 @@ void set_iounmap_nonlazy(void)
/*
* Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
*/
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
- static DEFINE_SPINLOCK(purge_lock);
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- int nr = 0;
+ bool do_free = false;
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
-
- if (sync)
- purge_fragmented_blocks_allcpus();
+ lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
+ do_free = true;
}
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ if (!do_free)
+ return false;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ flush_tlb_kernel_range(start, end);
- if (nr) {
- spin_lock(&vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+ __free_vmap_area(va);
+ atomic_sub(nr, &vmap_lazy_nr);
+ cond_resched_lock(&vmap_area_lock);
}
- spin_unlock(&purge_lock);
+ spin_unlock(&vmap_area_lock);
+ return true;
}
/*
@@ -676,9 +664,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
*/
static void try_purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ if (mutex_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+ }
}
/*
@@ -686,9 +675,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -711,22 +701,13 @@ static void free_vmap_area_noflush(struct vmap_area *va)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
- */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
-{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
-}
-
-/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -740,16 +721,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
-{
- struct vmap_area *va;
-
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
-
-
/*** Per cpu kva allocator ***/
/*
@@ -1070,6 +1041,8 @@ void vm_unmap_aliases(void)
if (unlikely(!vmap_initialized))
return;
+ might_sleep();
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
@@ -1094,7 +1067,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -1107,7 +1084,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1116,10 +1095,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (likely(count <= VMAP_MAX_ALLOC)) {
vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1455,6 +1438,8 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ might_sleep();
+
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
@@ -1510,7 +1495,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
+static inline void __vfree_deferred(const void *addr)
+{
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * nother cpu's list. schedule_work() should be fine with this too.
+ */
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ __vfree_deferred(addr);
+}
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1533,11 +1550,9 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
@@ -2574,32 +2589,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_area_lock)
{
- loff_t n = *pos;
- struct vmap_area *va;
-
spin_lock(&vmap_area_lock);
- va = list_first_entry(&vmap_area_list, typeof(*va), list);
- while (n > 0 && &va->list != &vmap_area_list) {
- n--;
- va = list_next_entry(va, list);
- }
- if (!n && &va->list != &vmap_area_list)
- return va;
-
- return NULL;
-
+ return seq_list_start(&vmap_area_list, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct vmap_area *va = p, *next;
-
- ++*pos;
- next = list_next_entry(va, list);
- if (&next->list != &vmap_area_list)
- return next;
-
- return NULL;
+ return seq_list_next(p, &vmap_area_list, pos);
}
static void s_stop(struct seq_file *m, void *p)
@@ -2634,9 +2630,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static int s_show(struct seq_file *m, void *p)
{
- struct vmap_area *va = p;
+ struct vmap_area *va;
struct vm_struct *v;
+ va = list_entry(p, struct vmap_area, list);
+
/*
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
* behalf of vmap area is being tear down or vm_map_ram allocation.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d75cdf360730..6aa5b01d3e75 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
+ long scanned = 0, next_deferred;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0)
@@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
- }
+ next_deferred = nr;
+ } else
+ next_deferred = total_scan;
/*
* We need to avoid excessive windup on filesystem shrinkers
@@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
count_vm_events(SLABS_SCANNED, nr_to_scan);
total_scan -= nr_to_scan;
+ scanned += nr_to_scan;
cond_resched();
}
+ if (next_deferred >= scanned)
+ next_deferred -= scanned;
+ else
+ next_deferred = 0;
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
+ if (next_deferred > 0)
+ new_nr = atomic_long_add_return(next_deferred,
&shrinker->nr_deferred[nid]);
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
@@ -3558,24 +3566,21 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int kswapd_cpu_online(unsigned int cpu)
{
int nid;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
- mask = cpumask_of_node(pgdat->node_id);
+ mask = cpumask_of_node(pgdat->node_id);
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kswapd, mask);
- }
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
- return NOTIFY_OK;
+ return 0;
}
/*
@@ -3617,12 +3622,15 @@ void kswapd_stop(int nid)
static int __init kswapd_init(void)
{
- int nid;
+ int nid, ret;
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
- hotcpu_notifier(cpu_callback, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/vmscan:online", kswapd_cpu_online,
+ NULL);
+ WARN_ON(ret < 0);
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 604f26a4f696..7c28df36f50f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1720,75 +1720,66 @@ static void __init start_shepherd_timer(void)
static void __init init_cpu_node_state(void)
{
- int cpu;
+ int node;
- get_online_cpus();
- for_each_online_cpu(cpu)
- node_set_state(cpu_to_node(cpu), N_CPU);
- put_online_cpus();
+ for_each_online_node(node) {
+ if (cpumask_weight(cpumask_of_node(node)) > 0)
+ node_set_state(node, N_CPU);
+ }
}
-static void vmstat_cpu_dead(int node)
+static int vmstat_cpu_online(unsigned int cpu)
{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (cpu_to_node(cpu) == node)
- goto end;
+ refresh_zone_stat_thresholds();
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ return 0;
+}
- node_clear_state(node, N_CPU);
-end:
- put_online_cpus();
+static int vmstat_cpu_down_prep(unsigned int cpu)
+{
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ return 0;
}
-/*
- * Use the cpu notifier to insure that the thresholds are recalculated
- * when necessary.
- */
-static int vmstat_cpuup_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- refresh_zone_stat_thresholds();
- vmstat_cpu_dead(cpu_to_node(cpu));
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+static int vmstat_cpu_dead(unsigned int cpu)
+{
+ const struct cpumask *node_cpus;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ refresh_zone_stat_thresholds();
+ node_cpus = cpumask_of_node(node);
+ if (cpumask_weight(node_cpus) > 0)
+ return 0;
+
+ node_clear_state(node, N_CPU);
+ return 0;
}
-static struct notifier_block vmstat_notifier =
- { &vmstat_cpuup_callback, NULL, 0 };
#endif
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
- cpu_notifier_register_begin();
- __register_cpu_notifier(&vmstat_notifier);
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
+ NULL, vmstat_cpu_dead);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'dead' hotplug state\n");
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
+ vmstat_cpu_online,
+ vmstat_cpu_down_prep);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'online' hotplug state\n");
+
+ get_online_cpus();
init_cpu_node_state();
+ put_online_cpus();
start_shepherd_timer();
- cpu_notifier_register_done();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/mm/workingset.c b/mm/workingset.c
index fb1f9183d89a..241fa5d6b3b2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -334,48 +335,81 @@ out:
* point where they would still be useful.
*/
-struct list_lru workingset_shadow_nodes;
+static struct list_lru shadow_nodes;
+
+void workingset_update_node(struct radix_tree_node *node, void *private)
+{
+ struct address_space *mapping = private;
+
+ /* Only regular page cache has shadow entries */
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+
+ /*
+ * Track non-empty nodes that contain only shadow entries;
+ * unlink those that contain pages or are being freed.
+ *
+ * Avoid acquiring the list_lru lock when the nodes are
+ * already where they should be. The list_empty() test is safe
+ * as node->private_list is protected by &mapping->tree_lock.
+ */
+ if (node->count && node->count == node->exceptional) {
+ if (list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&shadow_nodes, &node->private_list);
+ }
+ } else {
+ if (!list_empty(&node->private_list))
+ list_lru_del(&shadow_nodes, &node->private_list);
+ }
+}
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long shadow_nodes;
unsigned long max_nodes;
- unsigned long pages;
+ unsigned long nodes;
+ unsigned long cache;
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+ nodes = list_lru_shrink_count(&shadow_nodes, sc);
local_irq_enable();
- if (sc->memcg) {
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
-
/*
- * Active cache pages are limited to 50% of memory, and shadow
- * entries that represent a refault distance bigger than that
- * do not have any effect. Limit the number of shadow nodes
- * such that shadow entries do not exceed the number of active
- * cache pages, assuming a worst-case node population density
- * of 1/8th on average.
+ * Approximate a reasonable limit for the radix tree nodes
+ * containing shadow entries. We don't need to keep more
+ * shadow entries than possible pages on the active list,
+ * since refault distances bigger than that are dismissed.
+ *
+ * The size of the active list converges toward 100% of
+ * overall page cache as memory grows, with only a tiny
+ * inactive list. Assume the total cache size for that.
+ *
+ * Nodes might be sparsely populated, with only one shadow
+ * entry in the extreme case. Obviously, we cannot keep one
+ * node for every eligible shadow entry, so compromise on a
+ * worst-case density of 1/8th. Below that, not all eligible
+ * refaults can be detected anymore.
*
* On 64-bit with 7 radix_tree_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
- * ~2% of available memory:
+ * ~1.8% of available memory:
*
- * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+ * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
*/
- max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+ if (sc->memcg) {
+ cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ } else {
+ cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+ node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
+ }
+ max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
- if (shadow_nodes <= max_nodes)
+ if (nodes <= max_nodes)
return 0;
-
- return shadow_nodes - max_nodes;
+ return nodes - max_nodes;
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
@@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- BUG_ON(!workingset_node_shadows(node));
- BUG_ON(workingset_node_pages(node));
-
+ if (WARN_ON_ONCE(!node->exceptional))
+ goto out_invalid;
+ if (WARN_ON_ONCE(node->count != node->exceptional))
+ goto out_invalid;
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
if (node->slots[i]) {
- BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
+ goto out_invalid;
+ if (WARN_ON_ONCE(!node->exceptional))
+ goto out_invalid;
+ if (WARN_ON_ONCE(!mapping->nrexceptional))
+ goto out_invalid;
node->slots[i] = NULL;
- workingset_node_shadows_dec(node);
- BUG_ON(!mapping->nrexceptional);
+ node->exceptional--;
+ node->count--;
mapping->nrexceptional--;
}
}
- BUG_ON(workingset_node_shadows(node));
+ if (WARN_ON_ONCE(node->exceptional))
+ goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- if (!__radix_tree_delete_node(&mapping->page_tree, node))
- BUG();
+ __radix_tree_delete_node(&mapping->page_tree, node);
+out_invalid:
spin_unlock(&mapping->tree_lock);
ret = LRU_REMOVED_RETRY;
out:
@@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
- shadow_lru_isolate, NULL);
+ ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
@@ -492,7 +532,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
ret = register_shrinker(&workingset_shadow_shrinker);
@@ -500,7 +540,7 @@ static int __init workingset_init(void)
goto err_list_lru;
return 0;
err_list_lru:
- list_lru_destroy(&workingset_shadow_nodes);
+ list_lru_destroy(&shadow_nodes);
err:
return ret;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b0bc023d25c5..9cc3c0b2c2c1 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1284,61 +1284,21 @@ out:
#endif /* CONFIG_PGTABLE_MAPPING */
-static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
- void *pcpu)
+static int zs_cpu_prepare(unsigned int cpu)
{
- int ret, cpu = (long)pcpu;
struct mapping_area *area;
- switch (action) {
- case CPU_UP_PREPARE:
- area = &per_cpu(zs_map_area, cpu);
- ret = __zs_cpu_up(area);
- if (ret)
- return notifier_from_errno(ret);
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- area = &per_cpu(zs_map_area, cpu);
- __zs_cpu_down(area);
- break;
- }
-
- return NOTIFY_OK;
+ area = &per_cpu(zs_map_area, cpu);
+ return __zs_cpu_up(area);
}
-static struct notifier_block zs_cpu_nb = {
- .notifier_call = zs_cpu_notifier
-};
-
-static int zs_register_cpu_notifier(void)
+static int zs_cpu_dead(unsigned int cpu)
{
- int cpu, uninitialized_var(ret);
-
- cpu_notifier_register_begin();
-
- __register_cpu_notifier(&zs_cpu_nb);
- for_each_online_cpu(cpu) {
- ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret))
- break;
- }
-
- cpu_notifier_register_done();
- return notifier_to_errno(ret);
-}
-
-static void zs_unregister_cpu_notifier(void)
-{
- int cpu;
-
- cpu_notifier_register_begin();
-
- for_each_online_cpu(cpu)
- zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
- __unregister_cpu_notifier(&zs_cpu_nb);
+ struct mapping_area *area;
- cpu_notifier_register_done();
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ return 0;
}
static void __init init_zs_size_classes(void)
@@ -2534,10 +2494,10 @@ static int __init zs_init(void)
if (ret)
goto out;
- ret = zs_register_cpu_notifier();
-
+ ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
+ zs_cpu_prepare, zs_cpu_dead);
if (ret)
- goto notifier_fail;
+ goto hp_setup_fail;
init_zs_size_classes();
@@ -2549,8 +2509,7 @@ static int __init zs_init(void)
return 0;
-notifier_fail:
- zs_unregister_cpu_notifier();
+hp_setup_fail:
zsmalloc_unmount();
out:
return ret;
@@ -2562,7 +2521,7 @@ static void __exit zs_exit(void)
zpool_unregister_driver(&zs_zpool_driver);
#endif
zsmalloc_unmount();
- zs_unregister_cpu_notifier();
+ cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 275b22cc8df4..067a0d62f318 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -118,7 +118,7 @@ struct zswap_pool {
struct kref kref;
struct list_head list;
struct work_struct work;
- struct notifier_block notifier;
+ struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -352,143 +352,58 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
+static int zswap_dstmem_prepare(unsigned int cpu)
{
u8 *dst;
- switch (action) {
- case CPU_UP_PREPARE:
- dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!dst) {
- pr_err("can't allocate compressor buffer\n");
- return NOTIFY_BAD;
- }
- per_cpu(zswap_dstmem, cpu) = dst;
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- dst = per_cpu(zswap_dstmem, cpu);
- kfree(dst);
- per_cpu(zswap_dstmem, cpu) = NULL;
- break;
- default:
- break;
+ dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!dst) {
+ pr_err("can't allocate compressor buffer\n");
+ return -ENOMEM;
}
- return NOTIFY_OK;
+ per_cpu(zswap_dstmem, cpu) = dst;
+ return 0;
}
-static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
- unsigned long action, void *pcpu)
+static int zswap_dstmem_dead(unsigned int cpu)
{
- return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
-}
+ u8 *dst;
-static struct notifier_block zswap_dstmem_notifier = {
- .notifier_call = zswap_cpu_dstmem_notifier,
-};
+ dst = per_cpu(zswap_dstmem, cpu);
+ kfree(dst);
+ per_cpu(zswap_dstmem, cpu) = NULL;
-static int __init zswap_cpu_dstmem_init(void)
-{
- unsigned long cpu;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
- NOTIFY_BAD)
- goto cleanup;
- __register_cpu_notifier(&zswap_dstmem_notifier);
- cpu_notifier_register_done();
return 0;
-
-cleanup:
- for_each_online_cpu(cpu)
- __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
- cpu_notifier_register_done();
- return -ENOMEM;
-}
-
-static void zswap_cpu_dstmem_destroy(void)
-{
- unsigned long cpu;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
- __unregister_cpu_notifier(&zswap_dstmem_notifier);
- cpu_notifier_register_done();
}
-static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
- unsigned long action, unsigned long cpu)
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_comp *tfm;
- switch (action) {
- case CPU_UP_PREPARE:
- if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
- break;
- tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
- if (IS_ERR_OR_NULL(tfm)) {
- pr_err("could not alloc crypto comp %s : %ld\n",
- pool->tfm_name, PTR_ERR(tfm));
- return NOTIFY_BAD;
- }
- *per_cpu_ptr(pool->tfm, cpu) = tfm;
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- tfm = *per_cpu_ptr(pool->tfm, cpu);
- if (!IS_ERR_OR_NULL(tfm))
- crypto_free_comp(tfm);
- *per_cpu_ptr(pool->tfm, cpu) = NULL;
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static int zswap_cpu_comp_notifier(struct notifier_block *nb,
- unsigned long action, void *pcpu)
-{
- unsigned long cpu = (unsigned long)pcpu;
- struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
-
- return __zswap_cpu_comp_notifier(pool, action, cpu);
-}
+ if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+ return 0;
-static int zswap_cpu_comp_init(struct zswap_pool *pool)
-{
- unsigned long cpu;
-
- memset(&pool->notifier, 0, sizeof(pool->notifier));
- pool->notifier.notifier_call = zswap_cpu_comp_notifier;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
- NOTIFY_BAD)
- goto cleanup;
- __register_cpu_notifier(&pool->notifier);
- cpu_notifier_register_done();
+ tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+ if (IS_ERR_OR_NULL(tfm)) {
+ pr_err("could not alloc crypto comp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(tfm));
+ return -ENOMEM;
+ }
+ *per_cpu_ptr(pool->tfm, cpu) = tfm;
return 0;
-
-cleanup:
- for_each_online_cpu(cpu)
- __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
- cpu_notifier_register_done();
- return -ENOMEM;
}
-static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
- unsigned long cpu;
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_comp *tfm;
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
- __unregister_cpu_notifier(&pool->notifier);
- cpu_notifier_register_done();
+ tfm = *per_cpu_ptr(pool->tfm, cpu);
+ if (!IS_ERR_OR_NULL(tfm))
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(pool->tfm, cpu) = NULL;
+ return 0;
}
/*********************************
@@ -569,6 +484,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ int ret;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
@@ -593,7 +509,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
- if (zswap_cpu_comp_init(pool))
+ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
+ &pool->node);
+ if (ret)
goto error;
pr_debug("using %s compressor\n", pool->tfm_name);
@@ -647,7 +565,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
{
zswap_pool_debug("destroying", pool);
- zswap_cpu_comp_destroy(pool);
+ cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->tfm);
zpool_destroy_pool(pool->zpool);
kfree(pool);
@@ -1238,6 +1156,7 @@ static void __exit zswap_debugfs_exit(void) { }
static int __init init_zswap(void)
{
struct zswap_pool *pool;
+ int ret;
zswap_init_started = true;
@@ -1246,11 +1165,20 @@ static int __init init_zswap(void)
goto cache_fail;
}
- if (zswap_cpu_dstmem_init()) {
+ ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
+ zswap_dstmem_prepare, zswap_dstmem_dead);
+ if (ret) {
pr_err("dstmem alloc failed\n");
goto dstmem_fail;
}
+ ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
+ "mm/zswap_pool:prepare",
+ zswap_cpu_comp_prepare,
+ zswap_cpu_comp_dead);
+ if (ret)
+ goto hp_fail;
+
pool = __zswap_pool_create_fallback();
if (!pool) {
pr_err("pool creation failed\n");
@@ -1267,7 +1195,9 @@ static int __init init_zswap(void)
return 0;
pool_fail:
- zswap_cpu_dstmem_destroy();
+ cpuhp_remove_state_nocalls(CPUHP_MM_ZSWP_POOL_PREPARE);
+hp_fail:
+ cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
zswap_entry_cache_destroy();
cache_fail: