summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c179
1 files changed, 124 insertions, 55 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 7663068a33c6..14fc0b40f0bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -30,7 +30,7 @@
/*
* 05.04.94 - Multi-page memory management added for v1.1.
- * Idea by Alex Bligh (alex@cconcepts.co.uk)
+ * Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
@@ -82,9 +82,9 @@
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
-struct page *mem_map;
-
EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
@@ -95,8 +95,7 @@ EXPORT_SYMBOL(mem_map);
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
-void * high_memory;
-
+void *high_memory;
EXPORT_SYMBOL(high_memory);
/*
@@ -120,10 +119,10 @@ static int __init disable_randmaps(char *s)
__setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
-
EXPORT_SYMBOL(zero_pfn);
+unsigned long highest_memmap_pfn __read_mostly;
+
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
@@ -556,7 +555,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
+ floor, next ? next->vm_start : ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
@@ -569,7 +568,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
+ floor, next ? next->vm_start : ceiling);
}
vma = next;
}
@@ -1001,7 +1000,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
int err;
- VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
err = copy_huge_pmd(dst_mm, src_mm,
dst_pmd, src_pmd, addr, vma);
if (err == -ENOMEM)
@@ -1032,6 +1031,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pud = pud_offset(src_pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ int err;
+
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+ err = copy_huge_pud(dst_mm, src_mm,
+ dst_pud, src_pud, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1129,9 +1140,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
- if (pte_none(ptent)) {
+ if (pte_none(ptent))
continue;
- }
if (pte_present(ptent)) {
struct page *page;
@@ -1263,9 +1273,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (next - addr != HPAGE_PUD_SIZE) {
+ VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ split_huge_pud(vma, pud, addr);
+ } else if (zap_huge_pud(tlb, vma, pud, addr))
+ goto next;
+ /* fall through */
+ }
if (pud_none_or_clear_bad(pud))
continue;
next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+ cond_resched();
} while (pud++, addr = next, addr != end);
return addr;
@@ -1441,10 +1461,10 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
- pgd_t * pgd = pgd_offset(mm, addr);
- pud_t * pud = pud_alloc(mm, pgd, addr);
+ pgd_t *pgd = pgd_offset(mm, addr);
+ pud_t *pud = pud_alloc(mm, pgd, addr);
if (pud) {
- pmd_t * pmd = pmd_alloc(mm, pud, addr);
+ pmd_t *pmd = pmd_alloc(mm, pud, addr);
if (pmd) {
VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
@@ -2035,7 +2055,7 @@ static int do_page_mkwrite(struct vm_fault *vmf)
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2307,7 +2327,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
@@ -2503,7 +2523,7 @@ void unmap_mapping_range(struct address_space *mapping,
hlen = ULONG_MAX - hba + 1;
}
- details.check_mapping = even_cows? NULL: mapping;
+ details.check_mapping = even_cows ? NULL : mapping;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
@@ -2861,7 +2881,7 @@ static int __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
int ret;
- ret = vma->vm_ops->fault(vma, vmf);
+ ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
@@ -2898,7 +2918,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
atomic_long_inc(&vma->vm_mm->nr_ptes);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
- vmf->prealloc_pte = 0;
+ vmf->prealloc_pte = NULL;
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
@@ -2946,7 +2966,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
- vmf->prealloc_pte = 0;
+ vmf->prealloc_pte = NULL;
}
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
@@ -3352,7 +3372,7 @@ static int do_fault(struct vm_fault *vmf)
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
pte_free(vma->vm_mm, vmf->prealloc_pte);
- vmf->prealloc_pte = 0;
+ vmf->prealloc_pte = NULL;
}
return ret;
}
@@ -3380,32 +3400,32 @@ static int do_numa_page(struct vm_fault *vmf)
int last_cpupid;
int target_nid;
bool migrated = false;
- pte_t pte = vmf->orig_pte;
- bool was_writable = pte_write(pte);
+ pte_t pte;
+ bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
- * The "pte" at this point cannot be used safely without
- * validation through pte_unmap_same(). It's of NUMA type but
- * the pfn may be screwed if the read is non atomic.
- *
- * We can safely just do a "set_pte_at()", because the old
- * page table entry is not accessible, so there would be no
- * concurrent hardware modifications to the PTE.
- */
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ */
vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, pte))) {
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
- /* Make it present again */
+ /*
+ * Make it present again, Depending on how arch implementes non
+ * accessible ptes, some can allow access by kernel mode.
+ */
+ pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
pte = pte_modify(pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
@@ -3466,8 +3486,8 @@ static int create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vmf->vma->vm_ops->pmd_fault)
- return vmf->vma->vm_ops->pmd_fault(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
}
@@ -3475,8 +3495,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
- if (vmf->vma->vm_ops->pmd_fault)
- return vmf->vma->vm_ops->pmd_fault(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3490,6 +3510,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3605,22 +3649,46 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
- pud_t *pud;
+ int ret;
pgd = pgd_offset(mm, address);
- pud = pud_alloc(mm, pgd, address);
- if (!pud)
+
+ vmf.pud = pud_alloc(mm, pgd, address);
+ if (!vmf.pud)
return VM_FAULT_OOM;
- vmf.pmd = pmd_alloc(mm, pud, address);
+ if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pud(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ pud_t orig_pud = *vmf.pud;
+
+ barrier();
+ if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ /* NUMA case for anonymous PUDs would go here */
+
+ if (dirty && !pud_write(orig_pud)) {
+ ret = wp_huge_pud(&vmf, orig_pud);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ huge_pud_set_accessed(&vmf, orig_pud);
+ return 0;
+ }
+ }
+ }
+
+ vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&vmf);
+ ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *vmf.pmd;
- int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
@@ -3680,14 +3748,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
- /*
- * The task may have entered a memcg OOM situation but
- * if the allocation error was handled gracefully (no
- * VM_FAULT_OOM), there is no need to kill anything.
- * Just clean up the OOM state peacefully.
- */
- if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
- mem_cgroup_oom_synchronize(false);
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
}
/*
@@ -3737,13 +3805,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
*/
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
+ spinlock_t *ptl;
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
- spin_lock(&mm->page_table_lock);
+ ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
@@ -3757,7 +3826,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
} else /* Another has populated it */
pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */