summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/fadvise.c10
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/gup.c53
-rw-r--r--mm/hmm.c17
-rw-r--r--mm/huge_memory.c82
-rw-r--r--mm/hugetlb.c377
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/khugepaged.c15
-rw-r--r--mm/kmemleak.c1
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c287
-rw-r--r--mm/memory-failure.c48
-rw-r--r--mm/memory.c96
-rw-r--r--mm/memory_hotplug.c48
-rw-r--r--mm/mempolicy.c39
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page_alloc.c180
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/page_owner.c20
-rw-r--r--mm/pgtable-generic.c6
-rw-r--r--mm/shmem.c59
-rw-r--r--mm/slab.c40
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c118
-rw-r--r--mm/slub.c61
-rw-r--r--mm/sparse-vmemmap.c67
-rw-r--r--mm/sparse.c49
-rw-r--r--mm/swap.c27
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c23
-rw-r--r--mm/usercopy.c133
-rw-r--r--mm/util.c36
-rw-r--r--mm/vmscan.c86
-rw-r--r--mm/zpool.c25
-rw-r--r--mm/zsmalloc.c31
-rw-r--r--mm/zswap.c91
43 files changed, 1216 insertions, 1007 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 03ff7703d322..c782e8fb7235 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB
A sane initial value is 80 MB.
-# For architectures that support deferred memory initialisation
-config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- bool
-
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
- depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- depends on NO_BOOTMEM && MEMORY_HOTPLUG
+ depends on NO_BOOTMEM
depends on !FLATMEM
help
Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/compaction.c b/mm/compaction.c
index 10cd757f1006..2c8999d027ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
- * @mode: The migration mode for async, sync light, or sync migration
+ * @prio: Determines how hard direct compaction should try to succeed
*
* This is the main entry point for direct page compaction.
*/
diff --git a/mm/fadvise.c b/mm/fadvise.c
index ec70d6e4b86d..767887f5f3bf 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
*/
start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
end_index = (endbyte >> PAGE_SHIFT);
- if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+ /*
+ * The page at end_index will be inclusively discarded according
+ * by invalidate_mapping_pages(), so subtracting 1 from
+ * end_index means we will skip the last page. But if endbyte
+ * is page aligned or is at the end of file, we should not skip
+ * that page - discarding the last page is safe enough.
+ */
+ if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+ endbyte != inode->i_size - 1) {
/* First page is tricky as 0 - 1 = -1, but pgoff_t
* is unsigned, so the end_index >= start_index
* check below would be true and we'll discard the whole
diff --git a/mm/filemap.c b/mm/filemap.c
index ee83baaf855d..693f62212a59 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -31,7 +31,6 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
-#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
diff --git a/mm/gup.c b/mm/gup.c
index e0d82b6706d7..1b46e6e74881 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -848,7 +848,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
- int *locked, bool notify_drop,
+ int *locked,
unsigned int flags)
{
long ret, pages_done;
@@ -922,7 +922,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages++;
start += PAGE_SIZE;
}
- if (notify_drop && lock_dropped && *locked) {
+ if (lock_dropped && *locked) {
/*
* We must let the caller know we temporarily dropped the lock
* and so the critical section protected by it was lost.
@@ -959,36 +959,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
int *locked)
{
return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, NULL, locked, true,
+ pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
/*
- * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
- * tsk, mm to be specified.
- *
- * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
- * caller if required (just like with __get_user_pages). "FOLL_GET"
- * is set implicitly if "pages" is non-NULL.
- */
-static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
- unsigned long nr_pages, struct page **pages,
- unsigned int gup_flags)
-{
- long ret;
- int locked = 1;
-
- down_read(&mm->mmap_sem);
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL,
- &locked, false, gup_flags);
- if (locked)
- up_read(&mm->mmap_sem);
- return ret;
-}
-
-/*
* get_user_pages_unlocked() is suitable to replace the form:
*
* down_read(&mm->mmap_sem);
@@ -1006,8 +982,16 @@ static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
- return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
- pages, gup_flags | FOLL_TOUCH);
+ struct mm_struct *mm = current->mm;
+ int locked = 1;
+ long ret;
+
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ &locked, gup_flags | FOLL_TOUCH);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -1073,7 +1057,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- locked, true,
+ locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -1090,7 +1074,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
struct vm_area_struct **vmas)
{
return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL, false,
+ pages, vmas, NULL,
gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1410,7 +1394,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON_PAGE(compound_head(page) != head, page);
- put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1420,6 +1403,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
ret = 1;
pte_unmap:
+ if (pgmap)
+ put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
}
@@ -1459,10 +1444,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
SetPageReferenced(page);
pages[*nr] = page;
get_page(page);
- put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
+
+ if (pgmap)
+ put_dev_pagemap(pgmap);
return 1;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index ea19742a5d60..320545b98ff5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -418,7 +418,7 @@ again:
}
if (!pte_present(pte)) {
- swp_entry_t entry;
+ swp_entry_t entry = pte_to_swp_entry(pte);
if (!non_swap_entry(entry)) {
if (hmm_vma_walk->fault)
@@ -426,8 +426,6 @@ again:
continue;
}
- entry = pte_to_swp_entry(pte);
-
/*
* This is a special swap entry, ignore migration, use
* device and report anything else as error.
@@ -838,10 +836,10 @@ static void hmm_devmem_release(struct device *dev, void *data)
mem_hotplug_begin();
if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
- __remove_pages(zone, start_pfn, npages);
+ __remove_pages(zone, start_pfn, npages, NULL);
else
arch_remove_memory(start_pfn << PAGE_SHIFT,
- npages << PAGE_SHIFT);
+ npages << PAGE_SHIFT, NULL);
mem_hotplug_done();
hmm_devmem_radix_release(resource);
@@ -882,7 +880,7 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
else
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
- devmem->pagemap.res = devmem->resource;
+ devmem->pagemap.res = *devmem->resource;
devmem->pagemap.page_fault = hmm_devmem_fault;
devmem->pagemap.page_free = hmm_devmem_free;
devmem->pagemap.dev = devmem->device;
@@ -931,17 +929,18 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
* want the linear mapping and thus use arch_add_memory().
*/
if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
- ret = arch_add_memory(nid, align_start, align_size, false);
+ ret = arch_add_memory(nid, align_start, align_size, NULL,
+ false);
else
ret = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, false);
+ align_size >> PAGE_SHIFT, NULL, false);
if (ret) {
mem_hotplug_done();
goto error_add_memory;
}
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT);
+ align_size >> PAGE_SHIFT, NULL);
mem_hotplug_done();
for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7ded98d114..87ab9b8f56b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmdp_invalidate() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = *pmd;
- pmdp_invalidate(vma, addr, pmd);
-
- /*
- * Recover dirty/young flags. It relies on pmdp_invalidate to not
- * corrupt them.
- */
- if (pmd_dirty(*pmd))
- entry = pmd_mkdirty(entry);
- if (pmd_young(*pmd))
- entry = pmd_mkyoung(entry);
+ entry = pmdp_invalidate(vma, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
@@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct mm_struct *mm = vma->vm_mm;
struct page *page;
pgtable_t pgtable;
- pmd_t _pmd;
- bool young, write, dirty, soft_dirty, pmd_migration = false;
+ pmd_t old_pmd, _pmd;
+ bool young, write, soft_dirty, pmd_migration = false;
unsigned long addr;
int i;
@@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
+ /*
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * must remain set at all times on the pmd until the split is complete
+ * for this pmd), then we flush the SMP TLB and finally we write the
+ * non-huge version of the pmd entry with pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- pmd_migration = is_pmd_migration_entry(*pmd);
+ pmd_migration = is_pmd_migration_entry(old_pmd);
if (pmd_migration) {
swp_entry_t entry;
- entry = pmd_to_swp_entry(*pmd);
+ entry = pmd_to_swp_entry(old_pmd);
page = pfn_to_page(swp_offset(entry));
} else
#endif
- page = pmd_page(*pmd);
+ page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
- write = pmd_write(*pmd);
- young = pmd_young(*pmd);
- dirty = pmd_dirty(*pmd);
- soft_dirty = pmd_soft_dirty(*pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
- pmdp_huge_split_prepare(vma, haddr, pmd);
+ /*
+ * Withdraw the table only after we mark the pmd entry invalid.
+ * This's critical for some architectures (Power).
+ */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
}
- if (dirty)
- SetPageDirty(page + i);
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
@@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * and pmd_trans_splitting must remain set at all times on the pmd
- * until the split is complete for this pmd), then we flush the SMP TLB
- * and finally we write the non-huge version of the pmd entry with
- * pmd_populate.
- */
- pmdp_invalidate(vma, haddr, pmd);
pmd_populate(mm, pmd, pgtable);
if (freeze) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a334f5fb730..7c204e3d132b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,10 +34,9 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
#include "internal.h"
-int hugepages_treat_as_movable;
-
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+ if (hugepage_migration_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- gfp_t gfp_mask;
- gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
spin_lock_irqsave(&zone->lock, flags);
pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
- struct page *page;
-
- page = alloc_gigantic_page(nid, h);
- if (page) {
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, nid);
- }
-
- return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- struct page *page = NULL;
- int nr_nodes, node;
-
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_gigantic_page_node(h, node);
- if (page)
- return 1;
- }
-
- return 0;
-}
-
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask) { return NULL; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed) { return 0; }
#endif
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
ClearPagePrivate(&page[1]);
}
+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+ if (!PageHuge(page))
+ return false;
+
+ return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = NULL;
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (h->surplus_huge_pages_node[nid]) {
+ if (PageHugeTemporary(page)) {
+ list_del(&page->lru);
+ ClearPageHugeTemporary(page);
+ update_and_free_page(h, page);
+ } else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
}
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
+ int order = huge_page_order(h);
struct page *page;
- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
- huge_page_order(h));
- if (page) {
- prep_new_huge_page(h, page, nid);
- }
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ if (page)
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ else
+ page = alloc_buddy_huge_page(h, gfp_mask,
+ nid, nmask);
+ if (!page)
+ return NULL;
+
+ if (hstate_is_gigantic(h))
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, page_to_nid(page));
return page;
}
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
+ * manner.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
int nr_nodes, node;
- int ret = 0;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page_node(h, node);
- if (page) {
- ret = 1;
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ if (page)
break;
- }
}
- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ if (!page)
+ return 0;
- return ret;
+ put_page(page); /* free it into the hugepage allocator */
+
+ return 1;
}
/*
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
- int order = huge_page_order(h);
-
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
- return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
- unsigned int r_nid;
+ struct page *page = NULL;
if (hstate_is_gigantic(h))
return NULL;
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+ goto out_unlock;
+ spin_unlock(&hugetlb_lock);
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ return NULL;
+
+ spin_lock(&hugetlb_lock);
/*
- * Assume we will successfully allocate the surplus page to
- * prevent racing processes from causing the surplus to exceed
- * overcommit
- *
- * This however introduces a different race, where a process B
- * tries to grow the static hugepage pool while alloc_pages() is
- * called by process A. B will only examine the per-node
- * counters in determining if surplus huge pages can be
- * converted to normal huge pages in adjust_pool_surplus(). A
- * won't be able to increment the per-node counter, until the
- * lock is dropped by B, but B doesn't drop hugetlb_lock until
- * no more huge pages can be converted from surplus to normal
- * state (and doesn't try to convert again). Thus, we have a
- * case where a surplus huge page exists, the pool is grown, and
- * the surplus huge page still exists after, even though it
- * should just have been converted to a normal huge page. This
- * does not leak memory, though, as the hugepage will be freed
- * once it is out of use. It also does not allow the counters to
- * go out of whack in adjust_pool_surplus() as we don't modify
- * the node values until we've gotten the hugepage and only the
- * per-node value is checked there.
+ * We could have raced with the pool size change.
+ * Double check that and simply deallocate the new page
+ * if we would end up overcommiting the surpluses. Abuse
+ * temporary page to workaround the nasty free_huge_page
+ * codeflow
*/
- spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- spin_unlock(&hugetlb_lock);
- return NULL;
+ SetPageHugeTemporary(page);
+ put_page(page);
+ page = NULL;
} else {
- h->nr_huge_pages++;
h->surplus_huge_pages++;
+ h->nr_huge_pages_node[page_to_nid(page)]++;
}
+
+out_unlock:
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ return page;
+}
- spin_lock(&hugetlb_lock);
- if (page) {
- INIT_LIST_HEAD(&page->lru);
- r_nid = page_to_nid(page);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_hugetlb_cgroup(page, NULL);
- /*
- * We incremented the global counters already
- */
- h->nr_huge_pages_node[r_nid]++;
- h->surplus_huge_pages_node[r_nid]++;
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- } else {
- h->nr_huge_pages--;
- h->surplus_huge_pages--;
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
- }
- spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ return NULL;
+
+ /*
+ * We do not account these pages as surplus because they are only
+ * temporary and will be released properly on the last reference
+ */
+ SetPageHugeTemporary(page);
return page;
}
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+ page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
return page;
}
-
+/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask)
{
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock(&hugetlb_lock);
- /* No reservations, try to overcommit */
+ return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct page *page;
+ gfp_t gfp_mask;
+ int node;
+
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask);
+ mpol_cond_put(mpol);
- return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return page;
}
/*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+ page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
-{
- struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
- if (IS_ERR(page))
- page = NULL;
- return page;
-}
-
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* free it into the hugepage allocator */
+
/*
* If we had gigantic hugepages allocated at boot time, we need
* to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
- } else if (!alloc_fresh_huge_page(h,
+ } else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY]))
break;
cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with __alloc_buddy_huge_page() here and be unable
+ * We might race with alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
/* yield cpu to avoid soft lockup */
cond_resched();
- if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed);
- else
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * __alloc_buddy_huge_page() is checking the global counter,
+ * alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
- struct hstate *h = &default_hstate;
+ struct hstate *h;
+ unsigned long total = 0;
+
if (!hugepages_supported())
return;
- seq_printf(m,
- "HugePages_Total: %5lu\n"
- "HugePages_Free: %5lu\n"
- "HugePages_Rsvd: %5lu\n"
- "HugePages_Surp: %5lu\n"
- "Hugepagesize: %8lu kB\n",
- h->nr_huge_pages,
- h->free_huge_pages,
- h->resv_huge_pages,
- h->surplus_huge_pages,
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+ for_each_hstate(h) {
+ unsigned long count = h->nr_huge_pages;
+
+ total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+ if (h == &default_hstate)
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
+ "HugePages_Surp: %5lu\n"
+ "Hugepagesize: %8lu kB\n",
+ count,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ (PAGE_SIZE << huge_page_order(h)) / 1024);
+ }
+
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
}
int hugetlb_report_node_meminfo(int nid, char *buf)
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
spin_unlock(&hugetlb_lock);
put_page(page);
}
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+ struct hstate *h = page_hstate(oldpage);
+
+ hugetlb_cgroup_migrate(oldpage, newpage);
+ set_page_owner_migrate_reason(newpage, reason);
+
+ /*
+ * transfer temporary state of the new huge page. This is
+ * reverse to other transitions because the newpage is going to
+ * be final while the old one will be freed so it takes over
+ * the temporary status.
+ *
+ * Also note that we have to transfer the per-node surplus state
+ * here as well otherwise the global surplus count will not match
+ * the per-node's.
+ */
+ if (PageHugeTemporary(newpage)) {
+ int old_nid = page_to_nid(oldpage);
+ int new_nid = page_to_nid(newpage);
+
+ SetPageHugeTemporary(oldpage);
+ ClearPageHugeTemporary(newpage);
+
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages_node[old_nid]) {
+ h->surplus_huge_pages_node[old_nid]--;
+ h->surplus_huge_pages_node[new_nid]++;
+ }
+ spin_unlock(&hugetlb_lock);
+ }
+}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 356df057a2a8..b6ac70616c32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -52,7 +52,7 @@ static int hwpoison_inject(void *data, u64 val)
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, 18, MF_COUNT_INCREASED);
+ return memory_failure(pfn, MF_COUNT_INCREASED);
put_out:
put_hwpoison_page(p);
return 0;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index b47664358796..27ddfd29112a 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
- return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+ return v->vm_pgoff + vma_pages(v) - 1;
}
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea4ff259b671..b7e2268dfc9a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
if (page_mapped(page))
- unmap_mapping_range(mapping, index << PAGE_SHIFT,
- PAGE_SIZE, 0);
+ unmap_mapping_pages(mapping, index, 1, false);
spin_lock_irq(&mapping->tree_lock);
@@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
- vma = NULL;
- else
+ /*
+ * Don't wait for semaphore (to avoid long wait times). Just move to
+ * the next mm on the list.
+ */
+ vma = NULL;
+ if (unlikely(!down_read_trylock(&mm->mmap_sem)))
+ goto breakouterloop_mmap_sem;
+ if (likely(!khugepaged_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
progress++;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f656ca27f6c2..e83987c55a08 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -91,7 +91,6 @@
#include <linux/stacktrace.h>
#include <linux/cache.h>
#include <linux/percpu.h>
-#include <linux/hardirq.h>
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <linux/mmzone.h>
diff --git a/mm/madvise.c b/mm/madvise.c
index 751e97aa2210..4d3c922ea1a1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -661,7 +661,7 @@ static int madvise_inject_error(int behavior,
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
page_to_pfn(page), start);
- ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(page), MF_COUNT_INCREASED);
if (ret)
return ret;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac2ffd5e02b9..0937f2c52c7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
int event)
{
- unsigned long val = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->events[event], cpu);
- return val;
+ return atomic_long_read(&memcg->events[event]);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
* counted as CACHE even if it's on ANON LRU.
*/
if (PageAnon(page))
- __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
else {
- __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
if (PageSwapBacked(page))
- __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+ __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
}
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
}
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(memcg->stat->events[PGPGIN]);
+ __count_memcg_events(memcg, PGPGIN, 1);
else {
- __this_cpu_inc(memcg->stat->events[PGPGOUT]);
+ __count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+ __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat->nr_page_events);
- next = __this_cpu_read(memcg->stat->targets[target]);
+ val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+ next = __this_cpu_read(memcg->stat_cpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
default:
break;
}
- __this_cpu_write(memcg->stat->targets[target], next);
+ __this_cpu_write(memcg->stat_cpu->targets[target], next);
return true;
}
return false;
@@ -1124,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-unsigned int memcg1_stats[] = {
+static const unsigned int memcg1_stats[] = {
MEMCG_CACHE,
MEMCG_RSS,
MEMCG_RSS_HUGE,
@@ -1206,20 +1177,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -1707,11 +1664,6 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1739,7 +1691,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
unsigned long flags;
bool ret = false;
- if (nr_pages > CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
local_irq_save(flags);
@@ -1808,7 +1760,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
stock->nr_pages += nr_pages;
- if (stock->nr_pages > CHARGE_BATCH)
+ if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
local_irq_restore(flags);
@@ -1858,9 +1810,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
+
+ for_each_mem_cgroup(memcg) {
+ int i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ int nid;
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->stat[i]);
+
+ if (i >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn;
+
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ }
+ }
+
+ for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->events[i]);
+ }
+ }
+
return 0;
}
@@ -1881,7 +1868,7 @@ static void high_work_func(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, high_work);
- reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+ reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
/*
@@ -1905,7 +1892,7 @@ void mem_cgroup_handle_over_high(void)
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -2415,18 +2402,11 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++)
head[i].mem_cgroup = head->mem_cgroup;
- __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
- HPAGE_PMD_NR);
+ __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- int nr_entries)
-{
- this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
-}
-
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
@@ -2450,8 +2430,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mem_cgroup_swap_statistics(from, -1);
- mem_cgroup_swap_statistics(to, 1);
+ mod_memcg_state(from, MEMCG_SWAP, -1);
+ mod_memcg_state(to, MEMCG_SWAP, 1);
return 0;
}
return -EINVAL;
@@ -2467,23 +2447,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+ unsigned long limit, bool memsw)
{
- unsigned long curusage;
- unsigned long oldusage;
bool enlarge = false;
- int retry_count;
int ret;
-
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memory);
+ bool limits_invariant;
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
do {
if (signal_pending(current)) {
@@ -2492,79 +2461,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
}
mutex_lock(&memcg_limit_mutex);
- if (limit > memcg->memsw.limit) {
+ /*
+ * Make sure that the new limit (memsw or memory limit) doesn't
+ * break our basic invariant rule memory.limit <= memsw.limit.
+ */
+ limits_invariant = memsw ? limit >= memcg->memory.limit :
+ limit <= memcg->memsw.limit;
+ if (!limits_invariant) {
mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
break;
}
- if (limit > memcg->memory.limit)
+ if (limit > counter->limit)
enlarge = true;
- ret = page_counter_limit(&memcg->memory, limit);
+ ret = page_counter_limit(counter, limit);
mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
- curusage = page_counter_read(&memcg->memory);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
-
- if (!ret && enlarge)
- memcg_oom_recover(memcg);
-
- return ret;
-}
-
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long limit)
-{
- unsigned long curusage;
- unsigned long oldusage;
- bool enlarge = false;
- int retry_count;
- int ret;
-
- /* see mem_cgroup_resize_res_limit */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memsw);
-
- do {
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- mutex_lock(&memcg_limit_mutex);
- if (limit < memcg->memory.limit) {
- mutex_unlock(&memcg_limit_mutex);
- ret = -EINVAL;
+ if (!try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, !memsw)) {
+ ret = -EBUSY;
break;
}
- if (limit > memcg->memsw.limit)
- enlarge = true;
- ret = page_counter_limit(&memcg->memsw, limit);
- mutex_unlock(&memcg_limit_mutex);
-
- if (!ret)
- break;
-
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
- curusage = page_counter_read(&memcg->memsw);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
@@ -3020,10 +2941,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
break;
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
@@ -3777,7 +3698,7 @@ static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
struct mem_cgroup_event *event =
container_of(wait, struct mem_cgroup_event, wait);
struct mem_cgroup *memcg = event->memcg;
- unsigned long flags = (unsigned long)key;
+ __poll_t flags = key_to_poll(key);
if (flags & POLLHUP) {
/*
@@ -4168,8 +4089,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
- if (!pn->lruvec_stat) {
+ pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat_cpu) {
kfree(pn);
return 1;
}
@@ -4187,7 +4108,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- free_percpu(pn->lruvec_stat);
+ free_percpu(pn->lruvec_stat_cpu);
kfree(pn);
}
@@ -4197,7 +4118,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat);
+ free_percpu(memcg->stat_cpu);
kfree(memcg);
}
@@ -4226,8 +4147,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg->id.id < 0)
goto fail;
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
+ memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat_cpu)
goto fail;
for_each_node(node)
@@ -4584,8 +4505,8 @@ static int mem_cgroup_move_account(struct page *page,
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
- __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
+ __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
+ __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -4597,16 +4518,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
- nr_pages);
- __this_cpu_add(to->stat->count[NR_FILE_DIRTY],
- nr_pages);
+ __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
+ __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
- __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
+ __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
+ __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
}
/*
@@ -5642,12 +5561,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
}
local_irq_save(flags);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
- __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
- __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
- __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
+ __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
+ __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
+ __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
+ __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+ __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -5828,6 +5747,20 @@ void mem_cgroup_sk_alloc(struct sock *sk)
if (!mem_cgroup_sockets_enabled)
return;
+ /*
+ * Socket cloning can throw us here with sk_memcg already
+ * filled. It won't however, necessarily happen from
+ * process context. So the test for root memcg given
+ * the current task's memcg won't help us in this case.
+ *
+ * Respecting the original socket's memcg is a better
+ * decision in this case.
+ */
+ if (sk->sk_memcg) {
+ css_get(&sk->sk_memcg->css);
+ return;
+ }
+
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
@@ -5874,7 +5807,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
if (in_softirq())
gfp_mask = GFP_NOWAIT;
- this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
return true;
@@ -5895,7 +5828,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
}
- this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+ mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
refill_stock(memcg, nr_pages);
}
@@ -6019,7 +5952,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
nr_entries);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, nr_entries);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
page->mem_cgroup = NULL;
@@ -6085,7 +6018,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, nr_pages);
+ mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
}
@@ -6113,7 +6046,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
else
page_counter_uncharge(&memcg->memsw, nr_pages);
}
- mem_cgroup_swap_statistics(memcg, -nr_pages);
+ mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4acdf393a801..4b80ccee4535 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -178,25 +178,19 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
* ``action optional'' if they are not immediately affected by the error
* ``action required'' if error happened in current execution context
*/
-static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+static int kill_proc(struct task_struct *t, unsigned long addr,
unsigned long pfn, struct page *page, int flags)
{
- struct siginfo si;
+ short addr_lsb;
int ret;
pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_addr = (void *)addr;
-#ifdef __ARCH_SI_TRAPNO
- si.si_trapno = trapno;
-#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
- si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, current);
+ ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
+ addr_lsb, current);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -204,8 +198,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
* This could cause a loop when the user sets SIGBUS
* to SIG_IGN, but hopefully no one will do that?
*/
- si.si_code = BUS_MCEERR_AO;
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
+ addr_lsb, t); /* synchronous? */
}
if (ret < 0)
pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
@@ -323,7 +317,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
+static void kill_procs(struct list_head *to_kill, int forcekill,
bool fail, struct page *page, unsigned long pfn,
int flags)
{
@@ -348,7 +342,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc(tk->tsk, tk->addr, trapno,
+ else if (kill_proc(tk->tsk, tk->addr,
pfn, page, flags) < 0)
pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -927,7 +921,7 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags, struct page **hpagep)
+ int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
@@ -1017,7 +1011,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
- kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
+ kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
return unmap_success;
}
@@ -1045,7 +1039,7 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
-static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
+static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
struct page *head = compound_head(p);
@@ -1090,7 +1084,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
return 0;
}
- if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1105,7 +1099,6 @@ out:
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
- * @trapno: Trap number reported in the signal to user space.
* @flags: fine tune action taken
*
* This function is called by the low level machine check code
@@ -1120,7 +1113,7 @@ out:
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks hold.
*/
-int memory_failure(unsigned long pfn, int trapno, int flags)
+int memory_failure(unsigned long pfn, int flags)
{
struct page *p;
struct page *hpage;
@@ -1129,7 +1122,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
- panic("Memory failure from trap %d on page %lx", trapno, pfn);
+ panic("Memory failure on page %lx", pfn);
if (!pfn_valid(pfn)) {
pr_err("Memory failure: %#lx: memory outside kernel control\n",
@@ -1139,7 +1132,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
p = pfn_to_page(pfn);
if (PageHuge(p))
- return memory_failure_hugetlb(pfn, trapno, flags);
+ return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
@@ -1268,7 +1261,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* When the raw error page is thp tail page, hpage points to the raw
* page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1296,7 +1289,6 @@ EXPORT_SYMBOL_GPL(memory_failure);
struct memory_failure_entry {
unsigned long pfn;
- int trapno;
int flags;
};
@@ -1312,7 +1304,6 @@ static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
/**
* memory_failure_queue - Schedule handling memory failure of a page.
* @pfn: Page Number of the corrupted page
- * @trapno: Trap number reported in the signal to user space.
* @flags: Flags for memory failure handling
*
* This function is called by the low level hardware error handler
@@ -1326,13 +1317,12 @@ static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
*
* Can run in IRQ context.
*/
-void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+void memory_failure_queue(unsigned long pfn, int flags)
{
struct memory_failure_cpu *mf_cpu;
unsigned long proc_flags;
struct memory_failure_entry entry = {
.pfn = pfn,
- .trapno = trapno,
.flags = flags,
};
@@ -1365,7 +1355,7 @@ static void memory_failure_work_func(struct work_struct *work)
if (entry.flags & MF_SOFT_OFFLINE)
soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
else
- memory_failure(entry.pfn, entry.trapno, entry.flags);
+ memory_failure(entry.pfn, entry.flags);
}
}
diff --git a/mm/memory.c b/mm/memory.c
index 793004608332..2248529e71c1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
*/
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -1897,12 +1904,26 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+{
+ /* these checks mirror the abort conditions in vm_normal_page */
+ if (vma->vm_flags & VM_MIXEDMAP)
+ return true;
+ if (pfn_t_devmap(pfn))
+ return true;
+ if (pfn_t_special(pfn))
+ return true;
+ if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ return true;
+ return false;
+}
+
static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
- BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+ BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
@@ -2792,8 +2813,37 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_pages() - Unmap pages from processes.
+ * @mapping: The address space containing pages to be unmapped.
+ * @start: Index of first page to be unmapped.
+ * @nr: Number of pages to be unmapped. 0 to unmap to end of file.
+ * @even_cows: Whether to unmap even private COWed pages.
+ *
+ * Unmap the pages in this address space from any userspace process which
+ * has them mmaped. Generally, you want to remove COWed pages as well when
+ * a file is being truncated, but not when invalidating pages from the page
+ * cache.
+ */
+void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
+ pgoff_t nr, bool even_cows)
+{
+ struct zap_details details = { };
+
+ details.check_mapping = even_cows ? NULL : mapping;
+ details.first_index = start;
+ details.last_index = start + nr - 1;
+ if (details.last_index < details.first_index)
+ details.last_index = ULONG_MAX;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_range - unmap the portion of all mmaps in the specified
- * address_space corresponding to the specified page range in the underlying
+ * address_space corresponding to the specified byte range in the underlying
* file.
*
* @mapping: the address space containing mmaps to be unmapped.
@@ -2811,7 +2861,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -2823,16 +2872,7 @@ void unmap_mapping_range(struct address_space *mapping,
hlen = ULONG_MAX - hba + 1;
}
- details.check_mapping = even_cows ? NULL : mapping;
- details.first_index = hba;
- details.last_index = hba + hlen - 1;
- if (details.last_index < details.first_index)
- details.last_index = ULONG_MAX;
-
- i_mmap_lock_write(mapping);
- if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
- i_mmap_unlock_write(mapping);
+ unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -3485,9 +3525,8 @@ static int fault_around_bytes_get(void *data, u64 *val)
}
/*
- * fault_around_pages() and fault_around_mask() expects fault_around_bytes
- * rounded down to nearest page order. It's what do_fault_around() expects to
- * see.
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
*/
static int fault_around_bytes_set(void *data, u64 val)
{
@@ -3530,13 +3569,14 @@ late_initcall(fault_around_debugfs);
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
- * fault_around_pages() defines how many pages we'll try to map.
- * do_fault_around() expects it to return a power of two less than or equal to
- * PTRS_PER_PTE.
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two less than or equal
+ * to PTRS_PER_PTE.
*
- * The virtual address of the area that we map is naturally aligned to the
- * fault_around_pages() value (and therefore to page order). This way it's
- * easier to guarantee that we don't cross page table boundaries.
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order). This way it's easier to guarantee
+ * that we don't cross page table boundaries.
*/
static int do_fault_around(struct vm_fault *vmf)
{
@@ -3553,8 +3593,8 @@ static int do_fault_around(struct vm_fault *vmf)
start_pgoff -= off;
/*
- * end_pgoff is either end of page table or end of vma
- * or fault_around_pages() from start_pgoff, depending what is nearest.
+ * end_pgoff is either the end of the page table, the end of
+ * the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c52aa05b106c..b2bd52ff7605 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, SECTION_INFO);
- usemap = __nr_to_section(section_nr)->pageblock_flags;
+ usemap = ms->pageblock_flags;
page = virt_to_page(usemap);
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
struct mem_section *ms;
struct page *page, *memmap;
- if (!pfn_valid(start_pfn))
- return;
-
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
- usemap = __nr_to_section(section_nr)->pageblock_flags;
+ usemap = ms->pageblock_flags;
page = virt_to_page(usemap);
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -250,7 +247,7 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
- bool want_memblock)
+ struct vmem_altmap *altmap, bool want_memblock)
{
int ret;
int i;
@@ -258,7 +255,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
+ ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
if (ret < 0)
return ret;
@@ -292,18 +289,17 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, bool want_memblock)
+ unsigned long nr_pages, struct vmem_altmap *altmap,
+ bool want_memblock)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
- struct vmem_altmap *altmap;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
- altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
if (altmap) {
/*
* Validate altmap is within bounds of the total request
@@ -318,7 +314,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
}
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
+ err = __add_section(nid, section_nr_to_pfn(i), altmap,
+ want_memblock);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -334,7 +331,6 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
out:
return err;
}
-EXPORT_SYMBOL_GPL(__add_pages);
#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
@@ -537,7 +533,7 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
}
static int __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset)
+ unsigned long map_offset, struct vmem_altmap *altmap)
{
unsigned long start_pfn;
int scn_nr;
@@ -554,7 +550,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
__remove_zone(zone, start_pfn);
- sparse_remove_one_section(zone, ms, map_offset);
+ sparse_remove_one_section(zone, ms, map_offset, altmap);
return 0;
}
@@ -570,7 +566,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
* calling offline_pages().
*/
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long i;
unsigned long map_offset = 0;
@@ -578,10 +574,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
/* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) {
- struct page *page = pfn_to_page(phys_start_pfn);
- struct vmem_altmap *altmap;
-
- altmap = to_vmem_altmap((unsigned long) page);
if (altmap)
map_offset = vmem_altmap_offset(altmap);
} else {
@@ -612,7 +604,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
+ ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
+ altmap);
map_offset = 0;
if (ret)
break;
@@ -802,8 +795,8 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}
-void __ref move_pfn_range_to_zone(struct zone *zone,
- unsigned long start_pfn, unsigned long nr_pages)
+void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -828,7 +821,8 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
- memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
+ MEMMAP_HOTPLUG, altmap);
set_zone_contiguous(zone);
}
@@ -900,7 +894,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
struct zone *zone;
zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+ move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
return zone;
}
@@ -1149,7 +1143,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, true);
+ ret = arch_add_memory(nid, start, size, NULL, true);
if (ret < 0)
goto error;
@@ -1637,7 +1631,7 @@ repeat:
goto failed_removal;
cond_resched();
- lru_add_drain_all_cpuslocked();
+ lru_add_drain_all();
drain_all_pages(zone);
pfn = scan_movable_pages(start_pfn, end_pfn);
@@ -1891,7 +1885,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
memblock_free(start, size);
memblock_remove(start, size);
- arch_remove_memory(start, size);
+ arch_remove_memory(start, size, NULL);
try_offline_node(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ce44d3ff03d..d879f1d8a44a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1121,8 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
}
if (PageHuge(page)) {
- BUG_ON(!vma);
- return alloc_huge_page_noerr(vma, address, 1);
+ return alloc_huge_page_vma(page_hstate(compound_head(page)),
+ vma, address);
} else if (thp_migration_supported() && PageTransHuge(page)) {
struct page *thp;
@@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
+ unsigned long t;
unsigned long nlongs;
unsigned long endmask;
@@ -1279,13 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
+ /*
+ * When the user specified more nodes than supported just check
+ * if the non supported part is all zero.
+ *
+ * If maxnode have more longs than MAX_NUMNODES, check
+ * the bits in that area first. And then go through to
+ * check the rest bits which equal or bigger than MAX_NUMNODES.
+ * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+ */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
@@ -1298,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
endmask = ~0UL;
}
+ if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+ unsigned long valid_mask = endmask;
+
+ valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+ if (get_user(t, nmask + nlongs - 1))
+ return -EFAULT;
+ if (t & valid_mask)
+ return -EINVAL;
+ }
+
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
@@ -1418,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_MEMORY])) {
- err = -EINVAL;
+ task_nodes = cpuset_mems_allowed(current);
+ nodes_and(*new, *new, task_nodes);
+ if (nodes_empty(*new))
+ goto out_put;
+
+ nodes_and(*new, *new, node_states[N_MEMORY]);
+ if (nodes_empty(*new))
goto out_put;
- }
err = security_task_movememory(task);
if (err)
diff --git a/mm/migrate.c b/mm/migrate.c
index 4d0be47a322a..1e5525a25691 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ put_anon:
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- hugetlb_cgroup_migrate(hpage, new_hpage);
+ move_hugetlb_state(hpage, new_hpage, reason);
put_new_page = NULL;
- set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 96edb33fd09a..eff6b88a993f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+ bool ret = false;
+
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
+ if (!mm_has_notifiers(mm))
+ return ret;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (!mn->ops->invalidate_range &&
+ !mn->ops->invalidate_range_start &&
+ !mn->ops->invalidate_range_end)
+ continue;
+
+ if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+ ret = true;
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+ return ret;
+}
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 58b629bb70de..e3309fcf586b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!page || PageKsm(page))
continue;
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ page_mapcount(page) != 1)
+ continue;
+
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index 17c00d93de2e..4b9864b17cb0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void unmap_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen,
- int even_cows)
-{
-}
-EXPORT_SYMBOL(unmap_mapping_range);
-
int filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 29f855551efe..f2e7dfb81eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688b6a0a..81e18ceef579 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
- * Determine how many pages need to be initialized durig early boot
+ * Determine how many pages need to be initialized during early boot
* (non-deferred initialization).
* The value of first_deferred_pfn will be set later, once non-deferred pages
* are initialized, but for now set it ULONG_MAX.
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
- /* Always populate low zones for address-contrained allocations */
+ /* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
@@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone,
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+ unsigned long zone, int nid, bool zero)
{
- mm_zero_struct_page(page);
+ if (zero)
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
}
static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
- int nid)
+ int nid, bool zero)
{
- return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
- __init_single_pfn(pfn, zid, nid);
+ __init_single_pfn(pfn, zid, nid, true);
}
#else
static inline void init_reserved_page(unsigned long pfn)
@@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void)
}
/*
- * Helper for deferred_init_range, free the given range, reset the counters, and
- * return number of pages freed.
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * First we check if pfn is valid on architectures where it is possible to have
+ * holes within pageblock_nr_pages. On systems where it is not possible, this
+ * function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ *
+ * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not belong
+ * to this memory node.
*/
-static inline unsigned long __init __def_free(unsigned long *nr_free,
- unsigned long *free_base_pfn,
- struct page **page)
+static inline bool __init
+deferred_pfn_valid(int nid, unsigned long pfn,
+ struct mminit_pfnnid_cache *nid_init_state)
{
- unsigned long nr = *nr_free;
+ if (!pfn_valid_within(pfn))
+ return false;
+ if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+ return false;
+ if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
+ return false;
+ return true;
+}
- deferred_free_range(*free_base_pfn, nr);
- *free_base_pfn = 0;
- *nr_free = 0;
- *page = NULL;
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long nr_free = 0;
- return nr;
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 0;
+ } else if (!(pfn & nr_pgmask)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 1;
+ cond_resched();
+ } else {
+ nr_free++;
+ }
+ }
+ /* Free the last block of pages to allocator */
+ deferred_free_range(pfn - nr_free, nr_free);
}
-static unsigned long __init deferred_init_range(int nid, int zid,
- unsigned long start_pfn,
- unsigned long end_pfn)
+/*
+ * Initialize struct pages. We minimize pfn page lookups and scheduler checks
+ * by performing it only once every pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long __init deferred_init_pages(int nid, int zid,
+ unsigned long pfn,
+ unsigned long end_pfn)
{
struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
- unsigned long free_base_pfn = 0;
unsigned long nr_pages = 0;
- unsigned long nr_free = 0;
struct page *page = NULL;
- unsigned long pfn;
- /*
- * First we check if pfn is valid on architectures where it is possible
- * to have holes within pageblock_nr_pages. On systems where it is not
- * possible, this function is optimized out.
- *
- * Then, we check if a current large page is valid by only checking the
- * validity of the head pfn.
- *
- * meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not
- * belong to this memory node.
- *
- * Finally, we minimize pfn page lookups and scheduler checks by
- * performing it only once every pageblock_nr_pages.
- *
- * We do it in two loops: first we initialize struct page, than free to
- * buddy allocator, becuse while we are freeing pages we can access
- * pages that are ahead (computing buddy page in __free_one_page()).
- */
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ page = NULL;
continue;
- if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
- if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- if (page && (pfn & nr_pgmask))
- page++;
- else
- page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zid, nid);
- cond_resched();
- }
- }
- }
-
- page = NULL;
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (page && (pfn & nr_pgmask)) {
- page++;
- nr_free++;
- } else {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- free_base_pfn = pfn;
- nr_free = 1;
cond_resched();
+ } else {
+ page++;
}
+ __init_single_page(page, pfn, zid, nid, true);
+ nr_pages++;
}
- /* Free the last block of pages to allocator */
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-
- return nr_pages;
+ return (nr_pages);
}
/* Initialise remaining memory on a node */
@@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data)
}
first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+ /*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, than free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ */
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
+ }
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_range(nid, zid, spfn, epfn);
+ deferred_free_pages(nid, zid, spfn, epfn);
}
/* Sanity check that the next zone really is unpopulated */
@@ -3391,7 +3398,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_THISNODE)
goto out;
- /* Exhausted what can be done so it's blamo time */
+ /* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
struct page *page;
/*
- * __get_free_pages() returns a 32-bit address, which cannot represent
+ * __get_free_pages() returns a virtual address, which cannot represent
* a highmem page
*/
VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
@@ -5314,9 +5321,9 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context)
+ unsigned long start_pfn, enum memmap_context context,
+ struct vmem_altmap *altmap)
{
- struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
unsigned long end_pfn = start_pfn + size;
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
@@ -5393,15 +5400,20 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
struct page *page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
+ __init_single_page(page, pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
} else {
- __init_single_pfn(pfn, zone, nid);
+ __init_single_pfn(pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
}
}
}
@@ -5417,7 +5429,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
#endif
static int zone_batchsize(struct zone *zone)
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2c16216c29b6..5295ef331165 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,7 +59,9 @@
*/
static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_DEBUG_PAGEALLOC
&debug_guardpage_ops,
+#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 270a8219ccd0..9886c6073828 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
- struct page *page;
- struct page_ext *page_ext;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long count = 0;
- /* Scan block by block. First and last block may be incomplete */
- pfn = zone->zone_start_pfn;
-
/*
* Walk the zone in pageblock_nr_pages steps. If a page block spans
* a zone boundary, it will be double counted between zones. This does
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
+ unsigned long block_end_pfn;
+
if (!pfn_valid(pfn)) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
@@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
-
for (; pfn < block_end_pfn; pfn++) {
+ struct page *page;
+ struct page_ext *page_ext;
+
if (!pfn_valid_within(pfn))
continue;
@@ -635,9 +633,7 @@ static int __init pageowner_init(void)
dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
NULL, &proc_page_owner_operations);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- return 0;
+ return PTR_ERR_OR_ZERO(dentry);
}
late_initcall(pageowner_init)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 1e4ee763c190..cf2af04b34b9 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
#endif
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t entry = *pmdp;
- set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return old;
}
#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 7fbe67be86fa..1907688b75ee 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2717,15 +2717,28 @@ continue_resched:
return error;
}
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (file->f_op == &shmem_file_operations)
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (file->f_op == &hugetlbfs_file_operations)
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE)
-int shmem_add_seals(struct file *file, unsigned int seals)
+static int memfd_add_seals(struct file *file, unsigned int seals)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned int *file_seals;
int error;
/*
@@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals)
* other file types.
*/
- if (file->f_op != &shmem_file_operations)
- return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EPERM;
if (seals & ~(unsigned int)F_ALL_SEALS)
@@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals)
inode_lock(inode);
- if (info->seals & F_SEAL_SEAL) {
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
error = -EPERM;
goto unlock;
}
- if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
error = mapping_deny_writable(file->f_mapping);
if (error)
goto unlock;
@@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals)
}
}
- info->seals |= seals;
+ *file_seals |= seals;
error = 0;
unlock:
inode_unlock(inode);
return error;
}
-EXPORT_SYMBOL_GPL(shmem_add_seals);
-int shmem_get_seals(struct file *file)
+static int memfd_get_seals(struct file *file)
{
- if (file->f_op != &shmem_file_operations)
- return -EINVAL;
+ unsigned int *seals = memfd_file_seals_ptr(file);
- return SHMEM_I(file_inode(file))->seals;
+ return seals ? *seals : -EINVAL;
}
-EXPORT_SYMBOL_GPL(shmem_get_seals);
-long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
long error;
@@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (arg > UINT_MAX)
return -EINVAL;
- error = shmem_add_seals(file, arg);
+ error = memfd_add_seals(file, arg);
break;
case F_GET_SEALS:
- error = shmem_get_seals(file);
+ error = memfd_get_seals(file);
break;
default:
error = -EINVAL;
@@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- struct shmem_inode_info *info;
+ unsigned int *file_seals;
struct file *file;
int fd, error;
char *name;
@@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create,
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
return -EINVAL;
} else {
- /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
- if (flags & MFD_ALLOW_SEALING)
- return -EINVAL;
/* Allow huge page size encoding in flags. */
if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
@@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create,
file->f_flags |= O_RDWR | O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING) {
- /*
- * flags check at beginning of function ensures
- * this is not a hugetlbfs (MFD_HUGETLB) file.
- */
- info = SHMEM_I(file_inode(file));
- info->seals &= ~F_SEAL_SEAL;
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
diff --git a/mm/slab.c b/mm/slab.c
index 4e51ef954026..cd86f15071ad 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1281,7 +1281,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
- SLAB_HWCACHE_ALIGN);
+ SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
slab_state = PARTIAL;
@@ -1291,7 +1291,8 @@ void __init kmem_cache_init(void)
*/
kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
kmalloc_info[INDEX_NODE].name,
- kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+ kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
+ 0, kmalloc_size(INDEX_NODE));
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
@@ -1316,8 +1317,6 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
- slab_state = UP;
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&slab_mutex);
list_for_each_entry(cachep, &slab_caches, list)
@@ -1353,8 +1352,6 @@ static int __init cpucache_init(void)
slab_online_cpu, slab_offline_cpu);
WARN_ON(ret < 0);
- /* Done! */
- slab_state = FULL;
return 0;
}
__initcall(cpucache_init);
@@ -4389,13 +4386,15 @@ module_init(slab_proc_init);
#ifdef CONFIG_HARDENED_USERCOPY
/*
- * Rejects objects that are incorrectly sized.
+ * Rejects incorrectly sized objects and objects that are to be copied
+ * to/from userspace but do not fall entirely within the containing slab
+ * cache's usercopy region.
*
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-const char *__check_heap_object(const void *ptr, unsigned long n,
- struct page *page)
+void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
+ bool to_user)
{
struct kmem_cache *cachep;
unsigned int objnr;
@@ -4409,11 +4408,26 @@ const char *__check_heap_object(const void *ptr, unsigned long n,
/* Find offset within object. */
offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
- /* Allow address range falling entirely within object size. */
- if (offset <= cachep->object_size && n <= cachep->object_size - offset)
- return NULL;
+ /* Allow address range falling entirely within usercopy region. */
+ if (offset >= cachep->useroffset &&
+ offset - cachep->useroffset <= cachep->usersize &&
+ n <= cachep->useroffset - offset + cachep->usersize)
+ return;
+
+ /*
+ * If the copy is still within the allocated object, produce
+ * a warning instead of rejecting the copy. This is intended
+ * to be a temporary method to find any missing usercopy
+ * whitelists.
+ */
+ if (usercopy_fallback &&
+ offset <= cachep->object_size &&
+ n <= cachep->object_size - offset) {
+ usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
+ return;
+ }
- return cachep->name;
+ usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
diff --git a/mm/slab.h b/mm/slab.h
index ad657ffa44e5..51813236e773 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -22,6 +22,8 @@ struct kmem_cache {
unsigned int size; /* The aligned/padded/added on size */
unsigned int align; /* Alignment as calculated */
slab_flags_t flags; /* Active flags on the slab */
+ size_t useroffset; /* Usercopy region offset */
+ size_t usersize; /* Usercopy region size */
const char *name; /* Slab name for sysfs */
int refcount; /* Use counter */
void (*ctor)(void *); /* Called on object slot creation */
@@ -78,9 +80,6 @@ extern const struct kmalloc_info_struct {
unsigned long size;
} kmalloc_info[];
-unsigned long calculate_alignment(slab_flags_t flags,
- unsigned long align, unsigned long size);
-
#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
@@ -95,9 +94,11 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
- slab_flags_t flags);
+ slab_flags_t flags, size_t useroffset,
+ size_t usersize);
extern void create_boot_cache(struct kmem_cache *, const char *name,
- size_t size, slab_flags_t flags);
+ size_t size, slab_flags_t flags, size_t useroffset,
+ size_t usersize);
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(size_t size, size_t align,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c8cb36774ba1..10f127b2de7c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -31,6 +31,14 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
+#ifdef CONFIG_HARDENED_USERCOPY
+bool usercopy_fallback __ro_after_init =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
+module_param(usercopy_fallback, bool, 0400);
+MODULE_PARM_DESC(usercopy_fallback,
+ "WARN instead of reject usercopy whitelist violations");
+#endif
+
static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
@@ -268,6 +276,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign;
+
+ ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+/*
* Find a mergeable slab cache
*/
int slab_unmergeable(struct kmem_cache *s)
@@ -281,6 +318,9 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->ctor)
return 1;
+ if (s->usersize)
+ return 1;
+
/*
* We may have set a slab to be unmergeable during bootstrap.
*/
@@ -337,41 +377,18 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
return NULL;
}
-/*
- * Figure out what the alignment of the objects will be given a set of
- * flags, a user specified alignment and the size of the objects.
- */
-unsigned long calculate_alignment(slab_flags_t flags,
- unsigned long align, unsigned long size)
-{
- /*
- * If the user wants hardware cache aligned objects then follow that
- * suggestion if the object is sufficiently large.
- *
- * The hardware cache alignment cannot override the specified
- * alignment though. If that is greater then use it.
- */
- if (flags & SLAB_HWCACHE_ALIGN) {
- unsigned long ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- align = max(align, ralign);
- }
-
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
-
- return ALIGN(align, sizeof(void *));
-}
-
static struct kmem_cache *create_cache(const char *name,
size_t object_size, size_t size, size_t align,
- slab_flags_t flags, void (*ctor)(void *),
+ slab_flags_t flags, size_t useroffset,
+ size_t usersize, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
+ if (WARN_ON(useroffset + usersize > object_size))
+ useroffset = usersize = 0;
+
err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
@@ -382,6 +399,8 @@ static struct kmem_cache *create_cache(const char *name,
s->size = size;
s->align = align;
s->ctor = ctor;
+ s->useroffset = useroffset;
+ s->usersize = usersize;
err = init_memcg_params(s, memcg, root_cache);
if (err)
@@ -406,11 +425,13 @@ out_free_cache:
}
/*
- * kmem_cache_create - Create a cache.
+ * kmem_cache_create_usercopy - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
* @flags: SLAB flags
+ * @useroffset: Usercopy region offset
+ * @usersize: Usercopy region size
* @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
@@ -430,8 +451,9 @@ out_free_cache:
* as davem.
*/
struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
- slab_flags_t flags, void (*ctor)(void *))
+kmem_cache_create_usercopy(const char *name, size_t size, size_t align,
+ slab_flags_t flags, size_t useroffset, size_t usersize,
+ void (*ctor)(void *))
{
struct kmem_cache *s = NULL;
const char *cache_name;
@@ -462,7 +484,13 @@ kmem_cache_create(const char *name, size_t size, size_t align,
*/
flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(name, size, align, flags, ctor);
+ /* Fail closed on bad usersize of useroffset values. */
+ if (WARN_ON(!usersize && useroffset) ||
+ WARN_ON(size < usersize || size - usersize < useroffset))
+ usersize = useroffset = 0;
+
+ if (!usersize)
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
@@ -474,7 +502,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
s = create_cache(cache_name, size, size,
calculate_alignment(flags, align, size),
- flags, ctor, NULL, NULL);
+ flags, useroffset, usersize, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -500,6 +528,15 @@ out_unlock:
}
return s;
}
+EXPORT_SYMBOL(kmem_cache_create_usercopy);
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+ slab_flags_t flags, void (*ctor)(void *))
+{
+ return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
+ ctor);
+}
EXPORT_SYMBOL(kmem_cache_create);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
@@ -612,6 +649,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
s = create_cache(cache_name, root_cache->object_size,
root_cache->size, root_cache->align,
root_cache->flags & CACHE_CREATE_MASK,
+ root_cache->useroffset, root_cache->usersize,
root_cache->ctor, memcg, root_cache);
/*
* If we could not create a memcg cache, do not complain, because
@@ -879,13 +917,15 @@ bool slab_is_available(void)
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
- slab_flags_t flags)
+ slab_flags_t flags, size_t useroffset, size_t usersize)
{
int err;
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+ s->useroffset = useroffset;
+ s->usersize = usersize;
slab_init_memcg_params(s);
@@ -899,14 +939,15 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
}
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
- slab_flags_t flags)
+ slab_flags_t flags, size_t useroffset,
+ size_t usersize)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags);
+ create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
s->refcount = 1;
@@ -1060,7 +1101,8 @@ void __init setup_kmalloc_cache_index_table(void)
static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
{
kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
- kmalloc_info[idx].size, flags);
+ kmalloc_info[idx].size, flags, 0,
+ kmalloc_info[idx].size);
}
/*
@@ -1101,7 +1143,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
BUG_ON(!n);
kmalloc_dma_caches[i] = create_kmalloc_cache(n,
- size, SLAB_CACHE_DMA | flags);
+ size, SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index cfd56e5a35fb..cc71176c6eef 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
u8 *start;
u8 *fault;
u8 *end;
+ u8 *pad;
int length;
int remainder;
@@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
+ pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+ fault = memchr_inv(pad, POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
return 1;
@@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section(KERN_ERR, "Padding ", end - remainder, remainder);
+ print_section(KERN_ERR, "Padding ", pad, remainder);
- restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
+ restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
return 0;
}
@@ -2220,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s,
/*
* Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available. This is done without interrupts disabled and without
- * preemption disabled. The cmpxchg is racy and may put the partial page
- * onto a random cpus partial slot.
+ * slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -3813,13 +3813,15 @@ EXPORT_SYMBOL(__kmalloc_node);
#ifdef CONFIG_HARDENED_USERCOPY
/*
- * Rejects objects that are incorrectly sized.
+ * Rejects incorrectly sized objects and objects that are to be copied
+ * to/from userspace but do not fall entirely within the containing slab
+ * cache's usercopy region.
*
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-const char *__check_heap_object(const void *ptr, unsigned long n,
- struct page *page)
+void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
+ bool to_user)
{
struct kmem_cache *s;
unsigned long offset;
@@ -3827,11 +3829,11 @@ const char *__check_heap_object(const void *ptr, unsigned long n,
/* Find object and usable object size. */
s = page->slab_cache;
- object_size = slab_ksize(s);
/* Reject impossible pointers. */
if (ptr < page_address(page))
- return s->name;
+ usercopy_abort("SLUB object not in SLUB page?!", NULL,
+ to_user, 0, n);
/* Find offset within object. */
offset = (ptr - page_address(page)) % s->size;
@@ -3839,15 +3841,31 @@ const char *__check_heap_object(const void *ptr, unsigned long n,
/* Adjust for redzone and reject if within the redzone. */
if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
if (offset < s->red_left_pad)
- return s->name;
+ usercopy_abort("SLUB object in left red zone",
+ s->name, to_user, offset, n);
offset -= s->red_left_pad;
}
- /* Allow address range falling entirely within object size. */
- if (offset <= object_size && n <= object_size - offset)
- return NULL;
+ /* Allow address range falling entirely within usercopy region. */
+ if (offset >= s->useroffset &&
+ offset - s->useroffset <= s->usersize &&
+ n <= s->useroffset - offset + s->usersize)
+ return;
+
+ /*
+ * If the copy is still within the allocated object, produce
+ * a warning instead of rejecting the copy. This is intended
+ * to be a temporary method to find any missing usercopy
+ * whitelists.
+ */
+ object_size = slab_ksize(s);
+ if (usercopy_fallback &&
+ offset <= object_size && n <= object_size - offset) {
+ usercopy_warn("SLUB object", s->name, to_user, offset, n);
+ return;
+ }
- return s->name;
+ usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
@@ -4181,7 +4199,7 @@ void __init kmem_cache_init(void)
kmem_cache = &boot_kmem_cache;
create_boot_cache(kmem_cache_node, "kmem_cache_node",
- sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
+ sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
register_hotmemory_notifier(&slab_memory_callback_nb);
@@ -4191,7 +4209,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
- SLAB_HWCACHE_ALIGN);
+ SLAB_HWCACHE_ALIGN, 0, 0);
kmem_cache = bootstrap(&boot_kmem_cache);
@@ -5061,6 +5079,12 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
SLAB_ATTR_RO(cache_dma);
#endif
+static ssize_t usersize_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%zu\n", s->usersize);
+}
+SLAB_ATTR_RO(usersize);
+
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
@@ -5435,6 +5459,7 @@ static struct attribute *slab_attrs[] = {
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
#endif
+ &usersize_attr.attr,
NULL
};
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 17acf01791fa..bd0276d5f66b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -74,7 +74,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
}
/* need to make sure size is all the same during early stage */
-static void * __meminit alloc_block_buf(unsigned long size, int node)
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
{
void *ptr;
@@ -107,33 +107,16 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
}
/**
- * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
- * @altmap - reserved page pool for the allocation
- * @nr_pfns - size (in pages) of the allocation
+ * altmap_alloc_block_buf - allocate pages from the device page map
+ * @altmap: device page map
+ * @size: size (in bytes) of the allocation
*
- * Allocations are aligned to the size of the request
+ * Allocations are aligned to the size of the request.
*/
-static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
- unsigned long nr_pfns)
-{
- unsigned long pfn = vmem_altmap_next_pfn(altmap);
- unsigned long nr_align;
-
- nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
- nr_align = ALIGN(pfn, nr_align) - pfn;
-
- if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
- return ULONG_MAX;
- altmap->alloc += nr_pfns;
- altmap->align += nr_align;
- return pfn + nr_align;
-}
-
-static void * __meminit altmap_alloc_block_buf(unsigned long size,
+void * __meminit altmap_alloc_block_buf(unsigned long size,
struct vmem_altmap *altmap)
{
- unsigned long pfn, nr_pfns;
- void *ptr;
+ unsigned long pfn, nr_pfns, nr_align;
if (size & ~PAGE_MASK) {
pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
@@ -141,25 +124,20 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
return NULL;
}
+ pfn = vmem_altmap_next_pfn(altmap);
nr_pfns = size >> PAGE_SHIFT;
- pfn = vmem_altmap_alloc(altmap, nr_pfns);
- if (pfn < ULONG_MAX)
- ptr = __va(__pfn_to_phys(pfn));
- else
- ptr = NULL;
- pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
- __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+ nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
+ nr_align = ALIGN(pfn, nr_align) - pfn;
+ if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+ return NULL;
- return ptr;
-}
+ altmap->alloc += nr_pfns;
+ altmap->align += nr_align;
+ pfn += nr_align;
-/* need to make sure size is all the same during early stage */
-void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
- struct vmem_altmap *altmap)
-{
- if (altmap)
- return altmap_alloc_block_buf(size, altmap);
- return alloc_block_buf(size, node);
+ pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
+ __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+ return __va(__pfn_to_phys(pfn));
}
void __meminit vmemmap_verify(pte_t *pte, int node,
@@ -178,7 +156,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = alloc_block_buf(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -278,7 +256,8 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
return 0;
}
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
+struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
+ struct vmem_altmap *altmap)
{
unsigned long start;
unsigned long end;
@@ -288,7 +267,7 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
start = (unsigned long)map;
end = (unsigned long)(map + PAGES_PER_SECTION);
- if (vmemmap_populate(start, end, nid))
+ if (vmemmap_populate(start, end, nid, altmap))
return NULL;
return map;
@@ -318,7 +297,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (!present_section_nr(pnum))
continue;
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
diff --git a/mm/sparse.c b/mm/sparse.c
index 2609aba121e8..7af5e7a92528 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
*/
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
- return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+ unsigned long coded_mem_map =
+ (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+ return coded_mem_map;
}
/*
@@ -417,7 +421,8 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
+struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
+ struct vmem_altmap *altmap)
{
struct page *map;
unsigned long size;
@@ -472,7 +477,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (!present_section_nr(pnum))
continue;
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
@@ -500,7 +505,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
struct mem_section *ms = __nr_to_section(pnum);
int nid = sparse_early_nid(ms);
- map = sparse_mem_map_populate(pnum, nid);
+ map = sparse_mem_map_populate(pnum, nid, NULL);
if (map)
return map;
@@ -678,17 +683,19 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ struct vmem_altmap *altmap)
{
/* This will make the necessary allocations eventually. */
- return sparse_mem_map_populate(pnum, nid);
+ return sparse_mem_map_populate(pnum, nid, altmap);
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap,
+ struct vmem_altmap *altmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
- vmemmap_free(start, end);
+ vmemmap_free(start, end, altmap);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
@@ -696,7 +703,7 @@ static void free_map_bootmem(struct page *memmap)
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
- vmemmap_free(start, end);
+ vmemmap_free(start, end, NULL);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
@@ -721,12 +728,14 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ struct vmem_altmap *altmap)
{
return __kmalloc_section_memmap();
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap,
+ struct vmem_altmap *altmap)
{
if (is_vmalloc_addr(memmap))
vfree(memmap);
@@ -773,7 +782,8 @@ static void free_map_bootmem(struct page *memmap)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct pglist_data *pgdat,
+ unsigned long start_pfn, struct vmem_altmap *altmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
@@ -789,12 +799,12 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long st
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
if (!usemap) {
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, altmap);
return -ENOMEM;
}
@@ -816,7 +826,7 @@ out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
kfree(usemap);
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, altmap);
}
return ret;
}
@@ -843,7 +853,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
-static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+ struct vmem_altmap *altmap)
{
struct page *usemap_page;
@@ -857,7 +868,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
kfree(usemap);
if (memmap)
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, altmap);
return;
}
@@ -871,7 +882,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
}
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset)
+ unsigned long map_offset, struct vmem_altmap *altmap)
{
struct page *memmap = NULL;
unsigned long *usemap = NULL, flags;
@@ -889,7 +900,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
clear_hwpoisoned_pages(memmap + map_offset,
PAGES_PER_SECTION - map_offset);
- free_section_usemap(memmap, usemap);
+ free_section_usemap(memmap, usemap, altmap);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 38e1b6374a97..10568b1548d4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page)
}
/**
- * lru_cache_add: add a page to the page lists
+ * lru_cache_add_anon - add a page to the page lists
* @page: the page to add
*/
void lru_cache_add_anon(struct page *page)
@@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-void lru_add_drain_all_cpuslocked(void)
+/*
+ * Doesn't need any cpu hotplug locking because we do rely on per-cpu
+ * kworkers being shut down before our page_alloc_cpu_dead callback is
+ * executed on the offlined cpu.
+ * Calling this function with cpu hotplug locks held can actually lead
+ * to obscure indirect dependencies via WQ context.
+ */
+void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
@@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void)
mutex_unlock(&lock);
}
-void lru_add_drain_all(void)
-{
- get_online_cpus();
- lru_add_drain_all_cpuslocked();
- put_online_cpus();
-}
-
/**
* release_pages - batched put_page()
* @pages: array of pages to release
@@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add);
*/
unsigned pagevec_lookup_entries(struct pagevec *pvec,
struct address_space *mapping,
- pgoff_t start, unsigned nr_pages,
+ pgoff_t start, unsigned nr_entries,
pgoff_t *indices)
{
- pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->nr = find_get_entries(mapping, start, nr_entries,
pvec->pages, indices);
return pagevec_count(pvec);
}
@@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index
- * @nr_pages: The maximum number of pages
*
- * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
* pages in the mapping starting from index @start and upto index @end
* (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
@@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
* also update @start to index the next page for the traversal.
*
* pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * number is smaller than PAGEVEC_SIZE, the end of specified range has been
* reached.
*/
unsigned pagevec_lookup_range(struct pagevec *pvec,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3074b02eaa09..42fe5653814a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2697,7 +2697,7 @@ out:
}
#ifdef CONFIG_PROC_FS
-static unsigned swaps_poll(struct file *file, poll_table *wait)
+static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
struct seq_file *seq = file->private_data;
diff --git a/mm/truncate.c b/mm/truncate.c
index e4b4cf0f4070..c34e2fd4f583 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -179,12 +179,8 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- loff_t holelen;
-
- holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
- unmap_mapping_range(mapping,
- (loff_t)page->index << PAGE_SHIFT,
- holelen, 0);
+ pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
@@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
/*
* Zap the rest of the file in one hit.
*/
- unmap_mapping_range(mapping,
- (loff_t)index << PAGE_SHIFT,
- (loff_t)(1 + end - index)
- << PAGE_SHIFT,
- 0);
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
- unmap_mapping_range(mapping,
- (loff_t)index << PAGE_SHIFT,
- PAGE_SIZE, 0);
+ unmap_mapping_pages(mapping, index,
+ 1, false);
}
}
BUG_ON(page_mapped(page));
@@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* get remapped later.
*/
if (dax_mapping(mapping)) {
- unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
- (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
+ unmap_mapping_pages(mapping, start, end - start + 1, false);
}
out:
cleancache_invalidate_inode(mapping);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index a9852b24715d..e9e9325f7638 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -58,12 +58,40 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
return GOOD_STACK;
}
-static void report_usercopy(const void *ptr, unsigned long len,
- bool to_user, const char *type)
+/*
+ * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found
+ * an unexpected state during a copy_from_user() or copy_to_user() call.
+ * There are several checks being performed on the buffer by the
+ * __check_object_size() function. Normal stack buffer usage should never
+ * trip the checks, and kernel text addressing will always trip the check.
+ * For cache objects, it is checking that only the whitelisted range of
+ * bytes for a given cache is being accessed (via the cache's usersize and
+ * useroffset fields). To adjust a cache whitelist, use the usercopy-aware
+ * kmem_cache_create_usercopy() function to create the cache (and
+ * carefully audit the whitelist range).
+ */
+void usercopy_warn(const char *name, const char *detail, bool to_user,
+ unsigned long offset, unsigned long len)
+{
+ WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to",
+ name ? : "unknown?!",
+ detail ? " '" : "", detail ? : "", detail ? "'" : "",
+ offset, len);
+}
+
+void __noreturn usercopy_abort(const char *name, const char *detail,
+ bool to_user, unsigned long offset,
+ unsigned long len)
{
- pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
- to_user ? "exposure" : "overwrite",
- to_user ? "from" : "to", ptr, type ? : "unknown", len);
+ pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to",
+ name ? : "unknown?!",
+ detail ? " '" : "", detail ? : "", detail ? "'" : "",
+ offset, len);
+
/*
* For greater effect, it would be nice to do do_group_exit(),
* but BUG() actually hooks all the lock-breaking and per-arch
@@ -73,10 +101,10 @@ static void report_usercopy(const void *ptr, unsigned long len,
}
/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
-static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
- unsigned long high)
+static bool overlaps(const unsigned long ptr, unsigned long n,
+ unsigned long low, unsigned long high)
{
- unsigned long check_low = (uintptr_t)ptr;
+ const unsigned long check_low = ptr;
unsigned long check_high = check_low + n;
/* Does not overlap if entirely above or entirely below. */
@@ -87,15 +115,15 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
}
/* Is this address range in the kernel text area? */
-static inline const char *check_kernel_text_object(const void *ptr,
- unsigned long n)
+static inline void check_kernel_text_object(const unsigned long ptr,
+ unsigned long n, bool to_user)
{
unsigned long textlow = (unsigned long)_stext;
unsigned long texthigh = (unsigned long)_etext;
unsigned long textlow_linear, texthigh_linear;
if (overlaps(ptr, n, textlow, texthigh))
- return "<kernel text>";
+ usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n);
/*
* Some architectures have virtual memory mappings with a secondary
@@ -108,32 +136,30 @@ static inline const char *check_kernel_text_object(const void *ptr,
textlow_linear = (unsigned long)lm_alias(textlow);
/* No different mapping: we're done. */
if (textlow_linear == textlow)
- return NULL;
+ return;
/* Check the secondary mapping... */
texthigh_linear = (unsigned long)lm_alias(texthigh);
if (overlaps(ptr, n, textlow_linear, texthigh_linear))
- return "<linear kernel text>";
-
- return NULL;
+ usercopy_abort("linear kernel text", NULL, to_user,
+ ptr - textlow_linear, n);
}
-static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
+ bool to_user)
{
/* Reject if object wraps past end of memory. */
- if ((unsigned long)ptr + n < (unsigned long)ptr)
- return "<wrapped address>";
+ if (ptr + n < ptr)
+ usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);
/* Reject if NULL or ZERO-allocation. */
if (ZERO_OR_NULL_PTR(ptr))
- return "<null>";
-
- return NULL;
+ usercopy_abort("null address", NULL, to_user, ptr, n);
}
/* Checks for allocs that are marked in some way as spanning multiple pages. */
-static inline const char *check_page_span(const void *ptr, unsigned long n,
- struct page *page, bool to_user)
+static inline void check_page_span(const void *ptr, unsigned long n,
+ struct page *page, bool to_user)
{
#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
const void *end = ptr + n - 1;
@@ -150,28 +176,28 @@ static inline const char *check_page_span(const void *ptr, unsigned long n,
if (ptr >= (const void *)__start_rodata &&
end <= (const void *)__end_rodata) {
if (!to_user)
- return "<rodata>";
- return NULL;
+ usercopy_abort("rodata", NULL, to_user, 0, n);
+ return;
}
/* Allow kernel data region (if not marked as Reserved). */
if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
- return NULL;
+ return;
/* Allow kernel bss region (if not marked as Reserved). */
if (ptr >= (const void *)__bss_start &&
end <= (const void *)__bss_stop)
- return NULL;
+ return;
/* Is the object wholly within one base page? */
if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
((unsigned long)end & (unsigned long)PAGE_MASK)))
- return NULL;
+ return;
/* Allow if fully inside the same compound (__GFP_COMP) page. */
endpage = virt_to_head_page(end);
if (likely(endpage == page))
- return NULL;
+ return;
/*
* Reject if range is entirely either Reserved (i.e. special or
@@ -181,36 +207,37 @@ static inline const char *check_page_span(const void *ptr, unsigned long n,
is_reserved = PageReserved(page);
is_cma = is_migrate_cma_page(page);
if (!is_reserved && !is_cma)
- return "<spans multiple pages>";
+ usercopy_abort("spans multiple pages", NULL, to_user, 0, n);
for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
page = virt_to_head_page(ptr);
if (is_reserved && !PageReserved(page))
- return "<spans Reserved and non-Reserved pages>";
+ usercopy_abort("spans Reserved and non-Reserved pages",
+ NULL, to_user, 0, n);
if (is_cma && !is_migrate_cma_page(page))
- return "<spans CMA and non-CMA pages>";
+ usercopy_abort("spans CMA and non-CMA pages", NULL,
+ to_user, 0, n);
}
#endif
-
- return NULL;
}
-static inline const char *check_heap_object(const void *ptr, unsigned long n,
- bool to_user)
+static inline void check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
{
struct page *page;
if (!virt_addr_valid(ptr))
- return NULL;
+ return;
page = virt_to_head_page(ptr);
- /* Check slab allocator for flags and size. */
- if (PageSlab(page))
- return __check_heap_object(ptr, n, page);
-
- /* Verify object does not incorrectly span multiple pages. */
- return check_page_span(ptr, n, page, to_user);
+ if (PageSlab(page)) {
+ /* Check slab allocator for flags and size. */
+ __check_heap_object(ptr, n, page, to_user);
+ } else {
+ /* Verify object does not incorrectly span multiple pages. */
+ check_page_span(ptr, n, page, to_user);
+ }
}
/*
@@ -221,21 +248,15 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- const char *err;
-
/* Skip all tests if size is zero. */
if (!n)
return;
/* Check for invalid addresses. */
- err = check_bogus_address(ptr, n);
- if (err)
- goto report;
+ check_bogus_address((const unsigned long)ptr, n, to_user);
/* Check for bad heap object. */
- err = check_heap_object(ptr, n, to_user);
- if (err)
- goto report;
+ check_heap_object(ptr, n, to_user);
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
@@ -251,16 +272,10 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
*/
return;
default:
- err = "<process stack>";
- goto report;
+ usercopy_abort("process stack", NULL, to_user, 0, n);
}
/* Check for object in kernel to avoid text exposure. */
- err = check_kernel_text_object(ptr, n);
- if (!err)
- return;
-
-report:
- report_usercopy(ptr, n, to_user, err);
+ check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);
diff --git a/mm/util.c b/mm/util.c
index 34e57fae959d..c1250501364f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -150,18 +150,14 @@ EXPORT_SYMBOL(kmemdup_nul);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure.
+ * Returns an ERR_PTR() on failure. Result is physically
+ * contiguous, to be freed by kfree().
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
- /*
- * Always use GFP_KERNEL, since copy_from_user() can sleep and
- * cause pagefault, which makes it pointless to use GFP_NOFS
- * or GFP_ATOMIC.
- */
- p = kmalloc_track_caller(len, GFP_KERNEL);
+ p = kmalloc_track_caller(len, GFP_USER);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -174,6 +170,32 @@ void *memdup_user(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user);
+/**
+ * vmemdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure. Result may be not
+ * physically contiguous. Use kvfree() to free.
+ */
+void *vmemdup_user(const void __user *src, size_t len)
+{
+ void *p;
+
+ p = kvmalloc(len, GFP_USER);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kvfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return p;
+}
+EXPORT_SYMBOL(vmemdup_user);
+
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 47d5ced51f2d..fdd3fc6be862 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
return nr;
}
-unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
-{
- unsigned long nr;
-
- nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
- node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
- node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
- node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
- node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
-
- return nr;
-}
-
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
@@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
- struct shrinker *shrinker,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+ struct shrinker *shrinker, int priority)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = (4 * nr_scanned) / shrinker->seeks;
- delta *= freeable;
- do_div(delta, nr_eligible + 1);
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_scanned, nr_eligible,
- freeable, delta, total_scan);
+ freeable, delta, total_scan, priority);
/*
* Normally, we should not scan less than batch_size objects in one
@@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
* @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @priority: the reclaim priority
*
* Call the shrink functions to age shrinkable caches.
*
@@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* objects from the memory cgroup specified. Otherwise, only unaware
* shrinkers are called.
*
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned. Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages. The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
*
* Returns the number of reclaimed slab objects.
*/
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+ int priority)
{
struct shrinker *shrinker;
unsigned long freed = 0;
@@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
- if (nr_scanned == 0)
- nr_scanned = SWAP_CLUSTER_MAX;
-
if (!down_read_trylock(&shrinker_rwsem)) {
/*
* If we would return 0, our callers would understand that we
@@ -501,7 +472,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
sc.nid = 0;
- freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+ freed += do_shrink_slab(&sc, shrinker, priority);
+ /*
+ * Bail out if someone want to register a new shrinker to
+ * prevent the regsitration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
}
up_read(&shrinker_rwsem);
@@ -519,8 +499,7 @@ void drop_slab_node(int nid)
freed = 0;
do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg,
- 1000, 1000);
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
} while (freed > 10);
}
@@ -1436,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (PageDirty(page)) {
struct address_space *mapping;
+ bool migrate_dirty;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
- * without blocking
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
*/
+ if (!trylock_page(page))
+ return ret;
+
mapping = page_mapping(page);
- if (mapping && !mapping->a_ops->migratepage)
+ migrate_dirty = mapping && mapping->a_ops->migratepage;
+ unlock_page(page);
+ if (!migrate_dirty)
return ret;
}
}
@@ -2615,14 +2604,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
-
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
if (memcg)
shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->nr_scanned - scanned,
- lru_pages);
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2646,14 +2633,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- /*
- * Shrink the slab caches in the same proportion that
- * the eligible LRU pages were scanned.
- */
if (global_reclaim(sc))
shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->nr_scanned - nr_scanned,
- node_lru_pages);
+ sc->priority);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
diff --git a/mm/zpool.c b/mm/zpool.c
index fd3ff719c32c..e1e7aa6d1d06 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -21,6 +21,7 @@ struct zpool {
struct zpool_driver *driver;
void *pool;
const struct zpool_ops *ops;
+ bool evictable;
struct list_head list;
};
@@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* This creates a new zpool of the specified type. The gfp flags will be
* used when allocating memory, if the implementation supports it. If the
- * ops param is NULL, then the created zpool will not be shrinkable.
+ * ops param is NULL, then the created zpool will not be evictable.
*
* Implementations must guarantee this to be thread-safe.
*
@@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
+ zpool->evictable = driver->shrink && ops && ops->evict;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
int zpool_shrink(struct zpool *zpool, unsigned int pages,
unsigned int *reclaimed)
{
- return zpool->driver->shrink(zpool->pool, pages, reclaimed);
+ return zpool->driver->shrink ?
+ zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
}
/**
@@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool)
return zpool->driver->total_size(zpool->pool);
}
+/**
+ * zpool_evictable() - Test if zpool is potentially evictable
+ * @pool The zpool to test
+ *
+ * Zpool is only potentially evictable when it's created with struct
+ * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
+ *
+ * However, it doesn't necessarily mean driver will use zpool_ops.evict
+ * in its implementation of zpool_driver.shrink. It could do internal
+ * defragmentation instead.
+ *
+ * Returns: true if potentially evictable; false otherwise.
+ */
+bool zpool_evictable(struct zpool *zpool)
+{
+ return zpool->evictable;
+}
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 683c0651098c..c3013505c305 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -46,6 +46,7 @@
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
+#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
@@ -257,11 +258,7 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
- /*
- * To signify that register_shrinker() was successful
- * and unregister_shrinker() will not Oops.
- */
- bool shrinker_enabled;
+
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -407,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static int zs_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- return -EINVAL;
-}
-
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -450,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = {
.destroy = zs_zpool_destroy,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -1057,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
* Reset OBJ_TAG_BITS bit to last link to tell
* whether it's allocated object or not.
*/
- link->next = -1 << OBJ_TAG_BITS;
+ link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_atomic(vaddr);
page = next_page;
@@ -2324,10 +2314,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
static void zs_unregister_shrinker(struct zs_pool *pool)
{
- if (pool->shrinker_enabled) {
- unregister_shrinker(&pool->shrinker);
- pool->shrinker_enabled = false;
- }
+ unregister_shrinker(&pool->shrinker);
}
static int zs_register_shrinker(struct zs_pool *pool)
@@ -2426,11 +2413,13 @@ struct zs_pool *zs_create_pool(const char *name)
goto err;
/*
- * Not critical, we still can use the pool
- * and user can trigger compaction manually.
+ * Not critical since shrinker is only used to trigger internal
+ * defragmentation of the pool which is pretty optional thing. If
+ * registration fails we still can use the pool normally and user can
+ * trigger compaction manually. Thus, ignore return code.
*/
- if (zs_register_shrinker(pool) == 0)
- pool->shrinker_enabled = true;
+ zs_register_shrinker(pool);
+
return pool;
err:
diff --git a/mm/zswap.c b/mm/zswap.c
index d39581a076c3..c004aa4fd3f4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -49,6 +49,8 @@
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+/* The number of same-value filled pages currently stored in zswap */
+static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
+/* Enable/disable handling same-value filled pages (enabled by default) */
+static bool zswap_same_filled_pages_enabled = true;
+module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
+ bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -145,9 +152,10 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression
+ * decompression. For a same value filled page length is 0.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
+ * value - value of the same-value filled pages which have same content
*/
struct zswap_entry {
struct rb_node rbnode;
@@ -155,7 +163,10 @@ struct zswap_entry {
int refcount;
unsigned int length;
struct zswap_pool *pool;
- unsigned long handle;
+ union {
+ unsigned long handle;
+ unsigned long value;
+ };
};
struct zswap_header {
@@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
- zpool_free(entry->pool->zpool, entry->handle);
- zswap_pool_put(entry->pool);
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
+ }
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
@@ -953,6 +968,28 @@ static int zswap_shrink(void)
return ret;
}
+static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos] != page[0])
+ return 0;
+ }
+ *value = page[0];
+ return 1;
+}
+
+static void zswap_fill_page(void *ptr, unsigned long value)
+{
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
+}
+
/*********************************
* frontswap hooks
**********************************/
@@ -964,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct zswap_entry *entry, *dupentry;
struct crypto_comp *tfm;
int ret;
- unsigned int dlen = PAGE_SIZE, len;
- unsigned long handle;
+ unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned long handle, value;
char *buf;
u8 *src, *dst;
- struct zswap_header *zhdr;
+ struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
if (!zswap_enabled || !tree) {
ret = -ENODEV;
@@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ if (zswap_same_filled_pages_enabled) {
+ src = kmap_atomic(page);
+ if (zswap_is_page_same_filled(src, &value)) {
+ kunmap_atomic(src);
+ entry->offset = offset;
+ entry->length = 0;
+ entry->value = value;
+ atomic_inc(&zswap_same_filled_pages);
+ goto insert_entry;
+ }
+ kunmap_atomic(src);
+ }
+
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool) {
@@ -1013,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* store */
- len = dlen + sizeof(struct zswap_header);
- ret = zpool_malloc(entry->pool->zpool, len,
+ hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
__GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
&handle);
if (ret == -ENOSPC) {
@@ -1025,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
- zhdr->swpentry = swp_entry(type, offset);
- buf = (u8 *)(zhdr + 1);
- memcpy(buf, dst, dlen);
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+ memcpy(buf, &zhdr, hlen);
+ memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
put_cpu_var(zswap_dstmem);
@@ -1037,6 +1086,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry->handle = handle;
entry->length = dlen;
+insert_entry:
/* map */
spin_lock(&tree->lock);
do {
@@ -1089,10 +1139,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
}
spin_unlock(&tree->lock);
+ if (!entry->length) {
+ dst = kmap_atomic(page);
+ zswap_fill_page(dst, entry->value);
+ kunmap_atomic(dst);
+ goto freeentry;
+ }
+
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
- ZPOOL_MM_RO) + sizeof(struct zswap_header);
+ src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+ if (zpool_evictable(entry->pool->zpool))
+ src += sizeof(struct zswap_header);
dst = kmap_atomic(page);
tfm = *get_cpu_ptr(entry->pool->tfm);
ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
@@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
+freeentry:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
@@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_pool_total_size);
debugfs_create_atomic_t("stored_pages", S_IRUGO,
zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_atomic_t("same_filled_pages", 0444,
+ zswap_debugfs_root, &zswap_same_filled_pages);
return 0;
}