summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/hugetlb.c238
-rw-r--r--mm/memcontrol.c28
-rw-r--r--mm/memory.c19
-rw-r--r--mm/memory_hotplug.c90
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mmu_notifier.c79
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/pagewalk.c70
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/shmem.c45
-rw-r--r--mm/slab_common.c4
-rw-r--r--mm/swap_state.c18
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c117
17 files changed, 516 insertions, 240 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06285b7..f5e698e30d4a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -173,7 +173,7 @@ config HAVE_BOOTMEM_INFO_NODE
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 538367ef1372..1b24bdcb3197 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -319,7 +319,7 @@ void __frontswap_invalidate_area(unsigned type)
return;
frontswap_ops->invalidate_area(type);
atomic_set(&sis->frontswap_pages, 0);
- memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+ bitmap_zero(sis->frontswap_map, sis->max);
}
clear_bit(type, need_init);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03a89a2f464b..362c329b83fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_unmap(pte);
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
- set_pmd_at(mm, address, pmd, _pmd);
+ /*
+ * We can only use set_pmd_at when establishing
+ * hugepmds and never for establishing regular pmds that
+ * points to regular pagetables. Use pmd_populate for that
+ */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(vma->anon_vma);
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f8feeeca6686..aed085ad11a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -2839,7 +2856,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait(mm, (pmd_t *)ptep, address);
+ migration_entry_wait_huge(mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -2931,15 +2948,6 @@ out_mutex:
return ret;
}
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -3169,6 +3177,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&mm->page_table_lock);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
+ }
+ return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cb1c9dedf9b6..194721839cf5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1199,7 +1199,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
- last_visited = iter->last_visited;
if (prev && reclaim->generation != iter->generation) {
iter->last_visited = NULL;
goto out_unlock;
@@ -1218,13 +1217,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
* is alive.
*/
dead_count = atomic_read(&root->dead_count);
- smp_rmb();
- last_visited = iter->last_visited;
- if (last_visited) {
- if ((dead_count != iter->last_dead_count) ||
- !css_tryget(&last_visited->css)) {
+ if (dead_count == iter->last_dead_count) {
+ smp_rmb();
+ last_visited = iter->last_visited;
+ if (last_visited &&
+ !css_tryget(&last_visited->css))
last_visited = NULL;
- }
}
}
@@ -3141,8 +3139,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
return -ENOMEM;
}
- INIT_WORK(&s->memcg_params->destroy,
- kmem_cache_destroy_work_func);
s->memcg_params->is_root_cache = true;
/*
@@ -4108,8 +4104,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (mem_cgroup_disabled())
return NULL;
- VM_BUG_ON(PageSwapCache(page));
-
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
VM_BUG_ON(!PageTransHuge(page));
@@ -4205,6 +4199,18 @@ void mem_cgroup_uncharge_page(struct page *page)
if (page_mapped(page))
return;
VM_BUG_ON(page->mapping && !PageAnon(page));
+ /*
+ * If the page is in swap cache, uncharge should be deferred
+ * to the swap path, which also properly accounts swap usage
+ * and handles memcg lifetime.
+ *
+ * Note that this check is not stable and reclaim may add the
+ * page to swap cache at any time after this. However, if the
+ * page is not in swap cache by the time page->mapcount hits
+ * 0, there won't be any page table references to the swap
+ * slot, and reclaim will free it and not actually write the
+ * page to disk.
+ */
if (PageSwapCache(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882fbd72..95d0cce63583 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -220,7 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->start = -1UL;
tlb->end = 0;
tlb->need_flush = 0;
- tlb->fast_mode = (num_possible_cpus() == 1);
tlb->local.next = NULL;
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
@@ -244,9 +243,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
tlb_table_flush(tlb);
#endif
- if (tlb_fast_mode(tlb))
- return;
-
for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
@@ -288,11 +284,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
VM_BUG_ON(!tlb->need_flush);
- if (tlb_fast_mode(tlb)) {
- free_page_and_swap_cache(page);
- return 1; /* avoid calling tlb_flush_mmu() */
- }
-
batch = tlb->active;
batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
@@ -4210,7 +4201,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
up_read(&mm->mmap_sem);
}
-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void)
{
/*
@@ -4222,13 +4213,17 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;
- might_sleep();
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
- if (!in_atomic() && current->mm)
+ if (in_atomic())
+ return;
+
+ __might_sleep(__FILE__, __LINE__, 0);
+
+ if (current->mm)
might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a221fac1f47d..081b4d654ed6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
start = phys_start_pfn << PAGE_SHIFT;
size = nr_pages * PAGE_SIZE;
ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret)
- pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
- start, start + size - 1, ret);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
@@ -1618,6 +1621,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
/**
* walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1631,7 +1635,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
*
* Returns the return value of func.
*/
-static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *))
{
struct memory_block *mem = NULL;
@@ -1668,24 +1672,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
return 0;
}
-/**
- * offline_memory_block_cb - callback function for offlining memory block
- * @mem: the memory block to be offlined
- * @arg: buffer to hold error msg
- *
- * Always return 0, and put the error msg in arg if any.
- */
-static int offline_memory_block_cb(struct memory_block *mem, void *arg)
-{
- int *ret = arg;
- int error = offline_memory_block(mem);
-
- if (error != 0 && *ret == 0)
- *ret = error;
-
- return 0;
-}
-
+#ifdef CONFIG_MEMORY_HOTREMOVE
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1811,54 +1798,22 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-int __ref remove_memory(int nid, u64 start, u64 size)
+void __ref remove_memory(int nid, u64 start, u64 size)
{
- unsigned long start_pfn, end_pfn;
- int ret = 0;
- int retry = 1;
-
- start_pfn = PFN_DOWN(start);
- end_pfn = PFN_UP(start + size - 1);
-
- /*
- * When CONFIG_MEMCG is on, one memory block may be used by other
- * blocks to store page cgroup when onlining pages. But we don't know
- * in what order pages are onlined. So we iterate twice to offline
- * memory:
- * 1st iterate: offline every non primary memory block.
- * 2nd iterate: offline primary (i.e. first added) memory block.
- */
-repeat:
- walk_memory_range(start_pfn, end_pfn, &ret,
- offline_memory_block_cb);
- if (ret) {
- if (!retry)
- return ret;
-
- retry = 0;
- ret = 0;
- goto repeat;
- }
+ int ret;
lock_memory_hotplug();
/*
- * we have offlined all memory blocks like this:
- * 1. lock memory hotplug
- * 2. offline a memory block
- * 3. unlock memory hotplug
- *
- * repeat step1-3 to offline the memory block. All memory blocks
- * must be offlined before removing memory. But we don't hold the
- * lock in the whole operation. So we should check whether all
- * memory blocks are offlined.
+ * All memory blocks must be offlined before removing memory. Check
+ * whether all memory blocks in question are offline and trigger a BUG()
+ * if this is not the case.
*/
-
- ret = walk_memory_range(start_pfn, end_pfn, NULL,
+ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
is_memblock_offlined_cb);
if (ret) {
unlock_memory_hotplug();
- return ret;
+ BUG();
}
/* remove memmap entry */
@@ -1869,17 +1824,6 @@ repeat:
try_offline_node(nid);
unlock_memory_hotplug();
-
- return 0;
-}
-#else
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return -EINVAL;
}
-int remove_memory(int nid, u64 start, u64 size)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed22579fd9..6f0c24438bba 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = arch_make_huge_pte(pte, vma, new, 0);
}
#endif
- flush_cache_page(vma, addr, pte_pfn(pte));
+ flush_dcache_page(new);
set_pte_at(mm, addr, ptep, pte);
if (PageHuge(new)) {
@@ -200,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl)
{
- pte_t *ptep, pte;
- spinlock_t *ptl;
+ pte_t pte;
swp_entry_t entry;
struct page *page;
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
@@ -236,6 +235,20 @@ out:
pte_unmap_unlock(ptep, ptl);
}
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ pte_t *ptep = pte_offset_map(pmd, address);
+ __migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+{
+ spinlock_t *ptl = &(mm)->page_table_lock;
+ __migration_entry_wait(mm, pte, ptl);
+}
+
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122fb277..6725ff183374 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm)
int id;
/*
- * srcu_read_lock() here will block synchronize_srcu() in
- * mmu_notifier_unregister() until all registered
- * ->release() callouts this function makes have
- * returned.
+ * SRCU here will block mmu_notifier_unregister until
+ * ->release returns.
*/
id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * If ->release runs before mmu_notifier_unregister it must be
+ * handled, as it's the only way for the driver to flush all
+ * existing sptes and stop the driver from establishing any more
+ * sptes before all the pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
struct mmu_notifier,
hlist);
-
/*
- * Unlink. This will prevent mmu_notifier_unregister()
- * from also making the ->release() callout.
+ * We arrived before mmu_notifier_unregister so
+ * mmu_notifier_unregister will do nothing other than to wait
+ * for ->release to finish and for mmu_notifier_unregister to
+ * return.
*/
hlist_del_init_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
-
- /*
- * Clear sptes. (see 'release' description in mmu_notifier.h)
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
-
- spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * All callouts to ->release() which we have done are complete.
- * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
- */
- srcu_read_unlock(&srcu, id);
-
- /*
- * mmu_notifier_unregister() may have unlinked a notifier and may
- * still be calling out to it. Additionally, other notifiers
- * may have been active via vmtruncate() et. al. Block here
- * to ensure that all notifier callouts for this mm have been
- * completed and the sptes are really cleaned up before returning
- * to exit_mmap().
+ * synchronize_srcu here prevents mmu_notifier_release from returning to
+ * exit_mmap (which would proceed with freeing all pages in the mm)
+ * until the ->release method returns, if it was invoked by
+ * mmu_notifier_unregister.
+ *
+ * The mmu_notifier_mm can't go away from under us because one mm_count
+ * is held by exit_mmap.
*/
synchronize_srcu(&srcu);
}
@@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
+ /*
+ * SRCU here will force exit_mmap to wait for ->release to
+ * finish before freeing the pages.
+ */
int id;
+ id = srcu_read_lock(&srcu);
/*
- * Ensure we synchronize up with __mmu_notifier_release().
+ * exit_mmap will block in mmu_notifier_release to guarantee
+ * that ->release is called before freeing the pages.
*/
- id = srcu_read_lock(&srcu);
-
- hlist_del_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
-
if (mn->ops->release)
mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+ spin_lock(&mm->mmu_notifier_mm->lock);
/*
- * Allow __mmu_notifier_release() to complete.
+ * Can not use list_del_rcu() since __mmu_notifier_release
+ * can delete it before we hold the lock.
*/
- srcu_read_unlock(&srcu, id);
- } else
+ hlist_del_init_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
/*
- * Wait for any running method to finish, including ->release() if it
- * was run by __mmu_notifier_release() instead of us.
+ * Wait for any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_relase instead of us.
*/
synchronize_srcu(&srcu);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98cbdf6e5532..c3edb624fccf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1628,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
+ long free_cma = 0;
free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
@@ -1637,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+ free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
- if (free_pages <= min + lowmem_reserve)
+
+ if (free_pages - free_cma <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -5158,7 +5160,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
if (poison)
memset((void *)pos, poison, PAGE_SIZE);
- free_reserved_page(virt_to_page(pos));
+ free_reserved_page(virt_to_page((void *)pos));
}
if (pages && s)
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 35aa294656cd..5da2cbcfdbb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
- struct vm_area_struct *vma;
-
- /* We don't need vma lookup at all. */
- if (!walk->hugetlb_entry)
- return NULL;
-
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- vma = find_vma(walk->mm, addr);
- if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
- return vma;
-
- return NULL;
-}
-
#else /* CONFIG_HUGETLB_PAGE */
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
- return NULL;
-}
-
static int walk_hugetlb_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (!walk->mm)
return -EINVAL;
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = NULL;
next = pgd_addr_end(addr, end);
/*
- * handle hugetlb vma individually because pagetable walk for
- * the hugetlb page is dependent on the architecture and
- * we can't handled it in the same manner as non-huge pages.
+ * This function was not intended to be vma based.
+ * But there are vma special cases to be handled:
+ * - hugetlb vma's
+ * - VM_PFNMAP vma's
*/
- vma = hugetlb_vma(addr, walk);
+ vma = find_vma(walk->mm, addr);
if (vma) {
- if (vma->vm_end < next)
+ /*
+ * There are no page structures backing a VM_PFNMAP
+ * range, so do not allow split_huge_page_pmd().
+ */
+ if ((vma->vm_start <= addr) &&
+ (vma->vm_flags & VM_PFNMAP)) {
next = vma->vm_end;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
/*
- * Hugepage is very tightly coupled with vma, so
- * walk through hugetlb entries within a given vma.
+ * Handle hugetlb vma individually because pagetable
+ * walk for the hugetlb page is dependent on the
+ * architecture and we can't handled it in the same
+ * manner as non-huge pages.
*/
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
- continue;
+ if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+ is_vm_hugetlb_page(vma)) {
+ if (vma->vm_end < next)
+ next = vma->vm_end;
+ /*
+ * Hugepage is very tightly coupled with vma,
+ * so walk through hugetlb entries within a
+ * given vma.
+ */
+ err = walk_hugetlb_range(vma, addr, next, walk);
+ if (err)
+ break;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
}
if (pgd_none_or_clear_bad(pgd)) {
diff --git a/mm/readahead.c b/mm/readahead.c
index daed28dd5830..829a77c62834 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
if (!trylock_page(page))
BUG();
page->mapping = mapping;
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
page->mapping = NULL;
unlock_page(page);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 5e6a8422658b..a87990cf9f94 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1798,10 +1798,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
}
}
- if (offset >= 0 && offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
mutex_unlock(&inode->i_mutex);
return offset;
}
@@ -1939,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ error = generic_acl_init(inode, dir);
+ if (error) {
+ iput(inode);
+ return error;
+ }
+#endif
error = security_inode_init_security(inode, dir,
&dentry->d_name,
shmem_initxattrs, NULL);
@@ -1948,6 +1952,33 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
return error;
}
}
+
+ error = 0;
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ }
+ return error;
+}
+
+static int
+shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+ int error = -ENOSPC;
+
+ inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ if (inode) {
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error) {
+ if (error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
+ }
+ }
#ifdef CONFIG_TMPFS_POSIX_ACL
error = generic_acl_init(inode, dir);
if (error) {
@@ -1957,10 +1988,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
#else
error = 0;
#endif
- dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ d_tmpfile(dentry, inode);
}
return error;
}
@@ -2723,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.rmdir = shmem_rmdir,
.mknod = shmem_mknod,
.rename = shmem_rename,
+ .tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ff3218a0f5e1..2d414508e9ec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -373,8 +373,10 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
int index;
- if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
+ if (size > KMALLOC_MAX_SIZE) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
+ }
if (size <= 192) {
if (!size)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b3d40dcf3624..f24ab0dff554 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -336,8 +336,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
+ if (err == -EEXIST) {
radix_tree_preload_end();
+ /*
+ * We might race against get_swap_page() and stumble
+ * across a SWAP_HAS_CACHE swap_map entry whose page
+ * has not been brought into the swapcache yet, while
+ * the other end is scheduled away waiting on discard
+ * I/O completion at scan_swap_map().
+ *
+ * In order to avoid turning this transitory state
+ * into a permanent loop around this -EEXIST case
+ * if !CONFIG_PREEMPT and the I/O completion happens
+ * to be waiting on the CPU waitqueue where we are now
+ * busy looping, we just conditionally invoke the
+ * scheduler here, if there are some more important
+ * tasks to run.
+ */
+ cond_resched();
continue;
}
if (err) { /* swp entry is obsolete ? */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c340d908b27..746af55b8455 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2116,7 +2116,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (frontswap_enabled)
- frontswap_map = vzalloc(maxpages / sizeof(long));
+ frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736e54b7..e2e8a8a7eb9d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -26,7 +26,8 @@
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* do_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -37,24 +38,18 @@
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void do_invalidatepage(struct page *page, unsigned long offset)
+void do_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- void (*invalidatepage)(struct page *, unsigned long);
+ void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
if (!invalidatepage)
invalidatepage = block_invalidatepage;
#endif
if (invalidatepage)
- (*invalidatepage)(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
- zero_user_segment(page, partial, PAGE_CACHE_SIZE);
- cleancache_invalidate_page(page->mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial);
+ (*invalidatepage)(page, offset, length);
}
/*
@@ -103,7 +98,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
return -EIO;
if (page_has_private(page))
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -185,11 +180,11 @@ int invalidate_inode_page(struct page *page)
* truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
- * @lend: offset to which to truncate
+ * @lend: offset to which to truncate (inclusive)
*
* Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
@@ -200,35 +195,58 @@ int invalidate_inode_page(struct page *page)
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
- struct pagevec pvec;
- pgoff_t index;
- pgoff_t end;
- int i;
+ pgoff_t start; /* inclusive */
+ pgoff_t end; /* exclusive */
+ unsigned int partial_start; /* inclusive */
+ unsigned int partial_end; /* exclusive */
+ struct pagevec pvec;
+ pgoff_t index;
+ int i;
cleancache_invalidate_inode(mapping);
if (mapping->nrpages == 0)
return;
- BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
- end = (lend >> PAGE_CACHE_SHIFT);
+ /* Offsets within partial pages */
+ partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * 'start' and 'end' always covers the range of pages to be fully
+ * truncated. Partial pages are covered with 'partial_start' at the
+ * start of the range and 'partial_end' at the end of the range.
+ * Note that 'end' is exclusive while 'lend' is inclusive.
+ */
+ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (lend == -1)
+ /*
+ * lend == -1 indicates end-of-file so we have to set 'end'
+ * to the highest possible pgoff_t and since the type is
+ * unsigned we're using -1.
+ */
+ end = -1;
+ else
+ end = (lend + 1) >> PAGE_CACHE_SHIFT;
pagevec_init(&pvec, 0);
index = start;
- while (index <= end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ while (index < end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = page->index;
- if (index > end)
+ if (index >= end)
break;
if (!trylock_page(page))
@@ -247,27 +265,56 @@ void truncate_inode_pages_range(struct address_space *mapping,
index++;
}
- if (partial) {
+ if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
+ unsigned int top = PAGE_CACHE_SIZE;
+ if (start > end) {
+ /* Truncation within a single page */
+ top = partial_end;
+ partial_end = 0;
+ }
wait_on_page_writeback(page);
- truncate_partial_page(page, partial);
+ zero_user_segment(page, partial_start, top);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, partial_start,
+ top - partial_start);
unlock_page(page);
page_cache_release(page);
}
}
+ if (partial_end) {
+ struct page *page = find_lock_page(mapping, end);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user_segment(page, 0, partial_end);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, 0,
+ partial_end);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ /*
+ * If the truncation happened within a single page no pages
+ * will be released, just zeroed, so we can bail out now.
+ */
+ if (start >= end)
+ return;
index = start;
for ( ; ; ) {
cond_resched();
if (!pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
if (index == start)
break;
index = start;
continue;
}
- if (index == start && pvec.pages[0]->index > end) {
+ if (index == start && pvec.pages[0]->index >= end) {
pagevec_release(&pvec);
break;
}
@@ -277,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
/* We rely upon deletion not changing page->index */
index = page->index;
- if (index > end)
+ if (index >= end)
break;
lock_page(page);
@@ -598,10 +645,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
* This rounding is currently just for example: unmap_mapping_range
* expands its hole outwards, whereas we want it to contract the hole
* inwards. However, existing callers of truncate_pagecache_range are
- * doing their own page rounding first; and truncate_inode_pages_range
- * currently BUGs if lend is not pagealigned-1 (it handles partial
- * page at start of hole, but not partial page at end of hole). Note
- * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+ * doing their own page rounding first. Note that unmap_mapping_range
+ * allows holelen 0 for all, and we allow lend -1 for end of file.
*/
/*