summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig54
-rw-r--r--mm/Makefile1
-rw-r--r--mm/cleancache.c2
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/filemap.c78
-rw-r--r--mm/gup.c236
-rw-r--r--mm/huge_memory.c39
-rw-r--r--mm/hugetlb.c103
-rw-r--r--mm/khugepaged.c1
-rw-r--r--mm/kmemleak.c136
-rw-r--r--mm/ksm.c823
-rw-r--r--mm/memblock.c26
-rw-r--r--mm/memcontrol.c91
-rw-r--r--mm/memory-failure.c26
-rw-r--r--mm/memory.c84
-rw-r--r--mm/memory_hotplug.c533
-rw-r--r--mm/mempolicy.c181
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mlock.c5
-rw-r--r--mm/mmap.c162
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c34
-rw-r--r--mm/page_alloc.c171
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/page_isolation.c26
-rw-r--r--mm/page_vma_mapped.c3
-rw-r--r--mm/pagewalk.c3
-rw-r--r--mm/percpu-internal.h166
-rw-r--r--mm/percpu-km.c11
-rw-r--r--mm/percpu-stats.c222
-rw-r--r--mm/percpu-vm.c12
-rw-r--r--mm/percpu.c85
-rw-r--r--mm/rmap.c31
-rw-r--r--mm/shmem.c15
-rw-r--r--mm/slab.c20
-rw-r--r--mm/slab.h18
-rw-r--r--mm/slab_common.c5
-rw-r--r--mm/slub.c159
-rw-r--r--mm/sparse.c104
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swap_cgroup.c43
-rw-r--r--mm/swap_slots.c16
-rw-r--r--mm/swap_state.c98
-rw-r--r--mm/swapfile.c289
-rw-r--r--mm/util.c7
-rw-r--r--mm/vmalloc.c22
-rw-r--r--mm/vmpressure.c6
-rw-r--r--mm/vmscan.c79
-rw-r--r--mm/vmstat.c26
-rw-r--r--mm/workingset.c9
-rw-r--r--mm/zswap.c11
54 files changed, 2917 insertions, 1397 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..46ef77d5c332 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
bool
config ARCH_DISCARD_MEMBLOCK
@@ -149,32 +149,6 @@ config NO_BOOTMEM
config MEMORY_ISOLATION
bool
-config MOVABLE_NODE
- bool "Enable to assign a node which has only movable memory"
- depends on HAVE_MEMBLOCK
- depends on NO_BOOTMEM
- depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
- depends on NUMA
- default n
- help
- Allow a node to have only movable memory. Pages used by the kernel,
- such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows the following
- two things:
- - When the system is booting, node full of hotpluggable memory can
- be arranged to have only movable memory so that the whole node can
- be hot-removed. (need movable_node boot option specified).
- - After the system is up, the option allows users to online all the
- memory of a node as movable memory so that the whole node can be
- hot-removed.
-
- Users who don't use the memory hotplug feature are fine with this
- option on since they don't specify movable_node boot option or they
- don't online memory as movable.
-
- Say Y here if you want to hotplug a whole node.
- Say N here if you want kernel to use memory on all nodes evenly.
-
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -446,6 +420,18 @@ choice
benefit.
endchoice
+config ARCH_WANTS_THP_SWAP
+ def_bool n
+
+config THP_SWAP
+ def_bool y
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+ help
+ Swap transparent huge pages in one piece, without splitting.
+ XXX: For now this only does clustered swap space allocation.
+
+ For selection by architectures with reasonable THP sizes.
+
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE
@@ -683,12 +669,16 @@ config IDLE_PAGE_TRACKING
See Documentation/vm/idle_page_tracking.txt for more details.
+# arch_add_memory() comprehends device memory
+config ARCH_HAS_ZONE_DEVICE
+ bool
+
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on X86_64 #arch_add_memory() comprehends device memory
+ depends on ARCH_HAS_ZONE_DEVICE
help
Device memory hotplug support allows for establishing pmem,
@@ -706,3 +696,11 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
bool
+
+config PERCPU_STATS
+ bool "Collect percpu memory statistics"
+ default n
+ help
+ This feature collects and exposes statistics via debugfs. The
+ information includes global and per chunk statistics, which can
+ be used to help understand percpu memory usage.
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..411bd24d4a7c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index ba5d8f3e6d68..f7b9fdc79d97 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb)
int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
if (cleancache_ops) {
- pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
if (pool_id < 0)
pool_id = CLEANCACHE_NO_POOL;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 613c59e928cb..fb548e4c7bd4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone)
cond_resched();
- if (!pfn_valid(pfn))
+ page = pfn_to_online_page(pfn);
+ if (!page)
continue;
-
- page = pfn_to_page(pfn);
if (zone != page_zone(page))
continue;
diff --git a/mm/filemap.c b/mm/filemap.c
index d7a30aefee0d..3247b4208034 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -386,6 +386,38 @@ int filemap_flush(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_flush);
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping:    address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte:   offset in bytes where the range ends (inclusive)
+ *
+ * Asks the page cache for a single page, starting the search at the page
+ * index of @start_byte, and reports whether the page found has an index no
+ * greater than the page index of @end_byte. Usually used to check if direct
+ * writing in this range will trigger a writeback.
+ *
+ * Return: true if such a page was found, false otherwise (including when
+ * the range is inverted or the mapping holds no pages).
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+			    loff_t start_byte, loff_t end_byte)
+{
+	struct pagevec pvec;
+	pgoff_t first = start_byte >> PAGE_SHIFT;
+	pgoff_t last = end_byte >> PAGE_SHIFT;
+	bool found = false;
+
+	/* An inverted range or an empty mapping cannot contain a page. */
+	if (end_byte < start_byte || mapping->nrpages == 0)
+		return false;
+
+	pagevec_init(&pvec, 0);
+	if (pagevec_lookup(&pvec, mapping, first, 1)) {
+		found = pvec.pages[0]->index <= last;
+		pagevec_release(&pvec);
+	}
+	return found;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
static void __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
@@ -860,10 +892,10 @@ struct wait_page_key {
struct wait_page_queue {
struct page *page;
int bit_nr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
@@ -926,7 +958,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, bool lock)
{
struct wait_page_queue wait_page;
- wait_queue_t *wait = &wait_page.wait;
+ wait_queue_entry_t *wait = &wait_page.wait;
int ret = 0;
init_wait(wait);
@@ -937,9 +969,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
for (;;) {
spin_lock_irq(&q->lock);
- if (likely(list_empty(&wait->task_list))) {
+ if (likely(list_empty(&wait->entry))) {
if (lock)
- __add_wait_queue_tail_exclusive(q, wait);
+ __add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
SetPageWaiters(page);
@@ -999,7 +1031,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
-void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
@@ -2130,10 +2162,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t size;
size = i_size_read(inode);
- retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ retval = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (retval < 0)
+ goto out;
+ }
file_accessed(file);
@@ -2318,7 +2357,7 @@ int filemap_fault(struct vm_fault *vmf)
/* No page in the page cache at all */
do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
@@ -2734,6 +2773,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
pos = iocb->ki_pos;
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2802,9 +2844,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
- written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
- if (written)
- goto out;
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		/*
+		 * If there are pages to writeback, return.
+		 *
+		 * filemap_range_has_page() takes an inclusive end offset, so
+		 * pass the last byte of the write (as the blocking branch
+		 * below does). Passing pos + write_len would also cover the
+		 * byte after the write and could report -EAGAIN for a cached
+		 * page this write never touches.
+		 */
+		if (filemap_range_has_page(inode->i_mapping, pos,
+					   pos + write_len - 1))
+			return -EAGAIN;
+	} else {
+		written = filemap_write_and_wait_range(mapping, pos,
+							pos + write_len - 1);
+		if (written)
+			goto out;
+	}
/*
* After a write we want buffered reads to be sure to go to disk to get
diff --git a/mm/gup.c b/mm/gup.c
index d9e6fddcc51f..23f01c40c88f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -208,72 +208,28 @@ no_page:
return no_page_table(vma, flags);
}
-/**
- * follow_page_mask - look up a page descriptor from a user-virtual address
- * @vma: vm_area_struct mapping @address
- * @address: virtual address to look up
- * @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
- *
- * @flags can have FOLL_ flags set, defined in <linux/mm.h>
- *
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
- * an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
- */
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+static struct page *follow_pmd_mask(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ unsigned int flags, unsigned int *page_mask)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
pmd_t *pmd;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- *page_mask = 0;
-
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- return page;
- }
-
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return no_page_table(vma, flags);
- p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d))
- return no_page_table(vma, flags);
- BUILD_BUG_ON(p4d_huge(*p4d));
- if (unlikely(p4d_bad(*p4d)))
- return no_page_table(vma, flags);
- pud = pud_offset(p4d, address);
- if (pud_none(*pud))
+ pmd = pmd_offset(pudp, address);
+ if (pmd_none(*pmd))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pud(mm, address, pud, flags);
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
- if (pud_devmap(*pud)) {
- ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags);
- spin_unlock(ptl);
- if (page)
- return page;
- }
- if (unlikely(pud_bad(*pud)))
- return no_page_table(vma, flags);
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pmd_val(*pmd)), flags,
+ PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
@@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return ret ? ERR_PTR(ret) :
follow_page_pte(vma, address, pmd, flags);
}
-
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
return page;
}
+
+/*
+ * PUD-level step of the follow_page_mask() walk.
+ *
+ * Looks up the PUD entry for @address below @p4dp. Entries that map a page
+ * at this level (hugetlb PUD in a VM_HUGETLB vma, hugepd, devmap) are
+ * resolved here; anything else is handed on to follow_pmd_mask(). Missing
+ * or bad entries are reported through no_page_table().
+ */
+static struct page *follow_pud_mask(struct vm_area_struct *vma,
+ unsigned long address, p4d_t *p4dp,
+ unsigned int flags, unsigned int *page_mask)
+{
+ pud_t *pud;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ pud = pud_offset(p4dp, address);
+ if (pud_none(*pud))
+ return no_page_table(vma, flags);
+ /* hugetlb page mapped directly by the PUD entry */
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ /* entry is a hugepage directory rather than a regular table pointer */
+ if (is_hugepd(__hugepd(pud_val(*pud)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pud_val(*pud)), flags,
+ PUD_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ /*
+ * devmap lookup runs under the PUD lock; if it yields no page we fall
+ * through to the checks below instead of failing outright.
+ */
+ if (pud_devmap(*pud)) {
+ ptl = pud_lock(mm, pud);
+ page = follow_devmap_pud(vma, address, pud, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (unlikely(pud_bad(*pud)))
+ return no_page_table(vma, flags);
+
+ return follow_pmd_mask(vma, address, pud, flags, page_mask);
+}
+
+
+/*
+ * P4D-level step of the follow_page_mask() walk.
+ *
+ * Looks up the P4D entry for @address below @pgdp. A hugepd entry is
+ * resolved here through follow_huge_pd(); otherwise the walk continues in
+ * follow_pud_mask(). Missing or bad entries are reported through
+ * no_page_table().
+ */
+static struct page *follow_p4d_mask(struct vm_area_struct *vma,
+ unsigned long address, pgd_t *pgdp,
+ unsigned int flags, unsigned int *page_mask)
+{
+ p4d_t *p4d;
+ struct page *page;
+
+ p4d = p4d_offset(pgdp, address);
+ if (p4d_none(*p4d))
+ return no_page_table(vma, flags);
+ /* build-time assertion: huge mappings at P4D level are not handled here */
+ BUILD_BUG_ON(p4d_huge(*p4d));
+ if (unlikely(p4d_bad(*p4d)))
+ return no_page_table(vma, flags);
+
+ /* entry is a hugepage directory rather than a regular table pointer */
+ if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(p4d_val(*p4d)), flags,
+ P4D_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ return follow_pud_mask(vma, address, p4d, flags, page_mask);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
+{
+ pgd_t *pgd;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ /* default; this function and the p4d/pud helpers below it leave it as is */
+ *page_mask = 0;
+
+ /* make this handle hugepd */
+ /*
+ * Any non-error result from follow_huge_addr() is returned as is;
+ * that path must not be combined with FOLL_GET.
+ */
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ return page;
+ }
+
+ pgd = pgd_offset(mm, address);
+
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ return no_page_table(vma, flags);
+
+ /* huge page mapped directly by the PGD entry */
+ if (pgd_huge(*pgd)) {
+ page = follow_huge_pgd(mm, address, pgd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ /* entry is a hugepage directory rather than a regular table pointer */
+ if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pgd_val(*pgd)), flags,
+ PGDIR_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
+ /* regular table: continue the walk one level down */
+ return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+}
+
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int gup_flags, struct vm_area_struct **vma,
struct page **page)
@@ -387,11 +461,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
- /* For mm_populate(), just skip the stack guard page. */
- if ((*flags & FOLL_POPULATE) &&
- (stack_guard_page_start(vma, address) ||
- stack_guard_page_end(vma, address + PAGE_SIZE)))
- return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@@ -407,12 +476,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
ret = handle_mm_fault(vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
- if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
- return -EFAULT;
+ int err = vm_fault_to_errno(ret, *flags);
+
+ if (err)
+ return err;
BUG();
}
@@ -723,12 +790,10 @@ retry:
ret = handle_mm_fault(vma, address, fault_flags);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return -EHWPOISON;
- if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
- return -EFAULT;
+ int err = vm_fault_to_errno(ret, 0);
+
+ if (err)
+ return err;
BUG();
}
@@ -1155,7 +1220,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
/*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1176,8 +1241,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- * pages containing page tables.
+ * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
@@ -1187,7 +1252,7 @@ struct page *get_dump_page(unsigned long addr)
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
#ifndef gup_get_pte
/*
@@ -1358,16 +1423,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, addr, end, pages, nr);
refs = 0;
- head = pmd_page(orig);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pmd_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1397,16 +1461,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, addr, end, pages, nr);
refs = 0;
- head = pud_page(orig);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pud_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1435,16 +1498,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
- head = pgd_page(orig);
- page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pgd_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1677,4 +1739,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret;
}
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a84909cf20d3..86975dec0ba1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
page = pmd_page(*vmf->pmd);
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
+ put_page(page);
goto out;
}
@@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
+ page_nid = -1;
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
- page_nid = -1;
+ put_page(page);
goto out;
}
@@ -1569,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
get_page(page);
spin_unlock(ptl);
split_huge_page(page);
- put_page(page);
unlock_page(page);
+ put_page(page);
goto out_unlocked;
}
@@ -2197,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc()/atomic_add().
*/
- if (PageAnon(head)) {
+ if (PageAnon(head) && !PageSwapCache(head)) {
page_ref_inc(page_tail);
} else {
/* Additional pin to radix tree */
@@ -2208,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->flags |= (head->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
@@ -2270,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- page_ref_inc(head);
+ /* Additional pin to radix tree of swap cache */
+ if (PageSwapCache(head))
+ page_ref_add(head, 2);
+ else
+ page_ref_inc(head);
} else {
/* Additional pin to radix tree */
page_ref_add(head, 2);
@@ -2379,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
return ret;
}
+/*
+ * Racy check whether the huge page can be split.
+ *
+ * The page passes when its total mapcount equals its reference count minus
+ * the expected radix tree pins minus one. Anonymous pages that are not in
+ * the swap cache are expected to have no radix tree pins; every other page
+ * is expected to have HPAGE_PMD_NR of them.
+ *
+ * If @pextra_pins is non-NULL, the expected number of radix tree pins is
+ * stored through it.
+ */
+bool can_split_huge_page(struct page *page, int *pextra_pins)
+{
+	int radix_pins = HPAGE_PMD_NR;
+
+	/* Only anonymous pages outside the swap cache lack radix tree pins */
+	if (PageAnon(page) && !PageSwapCache(page))
+		radix_pins = 0;
+
+	if (pextra_pins)
+		*pextra_pins = radix_pins;
+
+	return total_mapcount(page) == page_count(page) - radix_pins - 1;
+}
+
/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
@@ -2426,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ret = -EBUSY;
goto out;
}
- extra_pins = 0;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -2438,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- /* Addidional pins from radix tree */
- extra_pins = HPAGE_PMD_NR;
anon_vma = NULL;
i_mmap_lock_read(mapping);
}
@@ -2448,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* Racy check if we can split the page, before freeze_page() will
* split PMDs
*/
- if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
+ if (!can_split_huge_page(head, &extra_pins)) {
ret = -EBUSY;
goto out_unlock;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e5828875f7bb..1a88006ec634 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -867,7 +867,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
@@ -887,6 +887,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
return page;
}
+/*
+ * Take a free huge page of hstate @h off node @nid.
+ *
+ * With NUMA_NO_NODE the online nodes are tried in iteration order and the
+ * first page obtained is returned; any other @nid is passed straight to
+ * dequeue_huge_page_node_exact(). Returns NULL if no page was obtained.
+ */
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *found = NULL;
+	int candidate;
+
+	if (nid == NUMA_NO_NODE) {
+		for_each_online_node(candidate) {
+			found = dequeue_huge_page_node_exact(h, candidate);
+			if (found)
+				break;
+		}
+		return found;
+	}
+
+	return dequeue_huge_page_node_exact(h, nid);
+}
+
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
@@ -904,6 +920,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
+ gfp_t gfp_mask;
+ int nid;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
@@ -924,12 +942,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask(h), &mpol, &nodemask);
+ gfp_mask = htlb_alloc_mask(h);
+ nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ zonelist = node_zonelist(nid, gfp_mask);
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
+ if (cpuset_zone_allowed(zone, gfp_mask)) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
if (avoid_reserve)
@@ -1024,9 +1043,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
- ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
- defined(CONFIG_CMA))
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1158,8 +1175,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
return 0;
}
-static inline bool gigantic_page_supported(void) { return true; }
-#else
+#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
@@ -1545,13 +1561,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
do {
struct page *page;
struct mempolicy *mpol;
- struct zonelist *zl;
+ int nid;
nodemask_t *nodemask;
cpuset_mems_cookie = read_mems_allowed_begin();
- zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+ nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
mpol_cond_put(mpol);
- page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+ page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
if (page)
return page;
} while (read_mems_allowed_retry(cpuset_mems_cookie));
@@ -3185,17 +3201,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
-static int is_hugetlb_entry_migration(pte_t pte)
+/*
+ * Report whether a huge pte holds a migration entry: the pte must be
+ * neither none nor present, and its swap entry must be a non-swap entry
+ * of migration type.
+ */
+bool is_hugetlb_entry_migration(pte_t pte)
 {
 	swp_entry_t swp;
 	if (huge_pte_none(pte) || pte_present(pte))
-		return 0;
+		return false;
 	swp = pte_to_swp_entry(pte);
-	if (non_swap_entry(swp) && is_migration_entry(swp))
-		return 1;
-	else
-		return 0;
+	return non_swap_entry(swp) && is_migration_entry(swp);
 }
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
@@ -3233,7 +3249,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr);
+ src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
@@ -3263,9 +3279,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
make_migration_entry_read(&swp_entry);
entry = swp_entry_to_pte(swp_entry);
- set_huge_pte_at(src, addr, src_pte, entry);
+ set_huge_swap_pte_at(src, addr, src_pte,
+ entry, sz);
}
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
if (cow) {
huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -3317,7 +3334,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
@@ -3338,7 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- huge_pte_clear(mm, address, ptep);
+ huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
@@ -3535,7 +3552,8 @@ retry_avoidcopy:
unmap_ref_private(mm, vma, old_page, address);
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+ huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -3574,7 +3592,8 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+ huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
@@ -3861,7 +3880,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
address &= huge_page_mask(h);
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
@@ -4118,7 +4137,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -4170,6 +4190,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
+ int err = vm_fault_to_errno(ret, flags);
+
+ if (err)
+ return err;
+
remainder = 0;
break;
}
@@ -4252,7 +4277,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
@@ -4274,7 +4299,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
make_migration_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_huge_pte_at(mm, address, ptep, newpte);
+ set_huge_swap_pte_at(mm, address, ptep,
+ newpte, huge_page_size(h));
pages++;
}
spin_unlock(ptl);
@@ -4516,7 +4542,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
+ spte = huge_pte_offset(svma->vm_mm, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -4612,7 +4639,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
return pte;
}
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4648,6 +4676,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
}
struct page * __weak
+follow_huge_pd(struct vm_area_struct *vma,
+ unsigned long address, hugepd_t hpd, int flags, int pdshift)
+{
+ WARN(1, "hugepd follow called with no support for hugepage directory format\n");
+ return NULL;
+}
+
+struct page * __weak
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int flags)
{
@@ -4694,6 +4730,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
+struct page * __weak
+follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
+{
+ if (flags & FOLL_GET)
+ return NULL;
+
+ return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
+}
+
#ifdef CONFIG_MEMORY_FAILURE
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 945fd1ca49b5..df4ebdb2b10a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
- cond_resched();
}
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 20036d4f9f13..7780cd83a495 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -150,7 +150,7 @@ struct kmemleak_scan_area {
*/
struct kmemleak_object {
spinlock_t lock;
- unsigned long flags; /* object status flags */
+ unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
struct rb_node rb_node;
@@ -159,6 +159,8 @@ struct kmemleak_object {
atomic_t use_count;
unsigned long pointer;
size_t size;
+ /* pass surplus references to this pointer */
+ unsigned long excess_ref;
/* minimum number of a pointers found before it is considered leak */
int min_count;
/* the total number of pointers found pointing to this object */
@@ -253,7 +255,8 @@ enum {
KMEMLEAK_NOT_LEAK,
KMEMLEAK_IGNORE,
KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN
+ KMEMLEAK_NO_SCAN,
+ KMEMLEAK_SET_EXCESS_REF
};
/*
@@ -262,9 +265,12 @@ enum {
*/
struct early_log {
int op_type; /* kmemleak operation type */
- const void *ptr; /* allocated/freed memory block */
- size_t size; /* memory block size */
int min_count; /* minimum reference count */
+ const void *ptr; /* allocated/freed memory block */
+ union {
+ size_t size; /* memory block size */
+ unsigned long excess_ref; /* surplus reference passing */
+ };
unsigned long trace[MAX_TRACE]; /* stack trace */
unsigned int trace_len; /* stack trace length */
};
@@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object)
object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
- pr_notice(" flags = 0x%lx\n", object->flags);
+ pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
@@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
object->size = size;
+ object->excess_ref = 0;
object->min_count = min_count;
object->count = 0; /* white color initially */
object->jiffies = jiffies;
@@ -795,6 +802,30 @@ out:
}
/*
+ * Any surplus references (object already gray) to 'ptr' are passed to
+ * 'excess_ref'. This is used in the vmalloc() case where a pointer to
+ * vm_struct may be used as an alternative reference to the vmalloc'ed object
+ * (see free_thread_stack()).
+ */
+static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ object->excess_ref = excess_ref;
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
* Set the OBJECT_NO_SCAN flag for the object corresponding to the give
* pointer. Such object will not be scanned by kmemleak but references to it
* are searched.
@@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log)
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*
* This function is called from the kernel allocators when a new object
- * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
/**
+ * kmemleak_vmalloc - register a newly vmalloc'ed object
+ * @area: pointer to vm_struct
+ * @size: size of the object
+ * @gfp: __vmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the vmalloc() kernel allocator when a new
+ * object (memory block) is allocated.
+ */
+void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp)
+{
+ pr_debug("%s(0x%p, %zu)\n", __func__, area, size);
+
+ /*
+ * A min_count = 2 is needed because vm_struct contains a reference to
+ * the virtual address of the vmalloc'ed block.
+ */
+ if (kmemleak_enabled) {
+ create_object((unsigned long)area->addr, size, 2, gfp);
+ object_set_excess_ref((unsigned long)area,
+ (unsigned long)area->addr);
+ } else if (kmemleak_early_log) {
+ log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
+ /* reusing early_log.size for storing area->addr */
+ log_early(KMEMLEAK_SET_EXCESS_REF,
+ area, (unsigned long)area->addr, 0);
+ }
+}
+EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
+
+/**
* kmemleak_free - unregister a previously registered object
* @ptr: pointer to beginning of the object
*
@@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object)
}
/*
+ * Update an object's references. object->lock must be held by the caller.
+ */
+static void update_refs(struct kmemleak_object *object)
+{
+ if (!color_white(object)) {
+ /* non-orphan, ignored or new */
+ return;
+ }
+
+ /*
+ * Increase the object's reference count (number of pointers to the
+ * memory block). If this count reaches the required minimum, the
+ * object's color will become gray and it will be added to the
+ * gray_list.
+ */
+ object->count++;
+ if (color_gray(object)) {
+ /* put_object() called when removing from gray_list */
+ WARN_ON(!get_object(object));
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+}
+
+/*
* Memory scanning is a long process and it needs to be interruptable. This
* function checks whether such interrupt condition occurred.
*/
@@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end,
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
+ unsigned long excess_ref;
if (scan_should_stop())
break;
@@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end,
* enclosed by scan_mutex.
*/
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
- if (!color_white(object)) {
- /* non-orphan, ignored or new */
- spin_unlock(&object->lock);
- continue;
- }
-
- /*
- * Increase the object's reference count (number of pointers
- * to the memory block). If this count reaches the required
- * minimum, the object's color will become gray and it will be
- * added to the gray_list.
- */
- object->count++;
+ /* only pass surplus references (object already gray) */
if (color_gray(object)) {
- /* put_object() called when removing from gray_list */
- WARN_ON(!get_object(object));
- list_add_tail(&object->gray_list, &gray_list);
+ excess_ref = object->excess_ref;
+ /* no need for update_refs() if object already gray */
+ } else {
+ excess_ref = 0;
+ update_refs(object);
}
spin_unlock(&object->lock);
+
+ if (excess_ref) {
+ object = lookup_object(excess_ref, 0);
+ if (!object)
+ continue;
+ if (object == scanned)
+ /* circular reference, ignore */
+ continue;
+ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ update_refs(object);
+ spin_unlock(&object->lock);
+ }
}
read_unlock_irqrestore(&kmemleak_lock, flags);
}
@@ -1980,6 +2068,10 @@ void __init kmemleak_init(void)
case KMEMLEAK_NO_SCAN:
kmemleak_no_scan(log->ptr);
break;
+ case KMEMLEAK_SET_EXCESS_REF:
+ object_set_excess_ref((unsigned long)log->ptr,
+ log->excess_ref);
+ break;
default:
kmemleak_warn("Unknown early log operation: %d\n",
log->op_type);
diff --git a/mm/ksm.c b/mm/ksm.c
index d9fc0e456128..4dc92f138786 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -128,9 +128,12 @@ struct ksm_scan {
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
* @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @chain_prune_time: time of the last full garbage collection
+ * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
struct stable_node {
@@ -138,11 +141,24 @@ struct stable_node {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
struct list_head *head;
- struct list_head list;
+ struct {
+ struct hlist_node hlist_dup;
+ struct list_head list;
+ };
};
};
struct hlist_head hlist;
- unsigned long kpfn;
+ union {
+ unsigned long kpfn;
+ unsigned long chain_prune_time;
+ };
+ /*
+ * STABLE_NODE_CHAIN can be any negative number in
+ * rmap_hlist_len negative range, but better not -1 to be able
+ * to reliably detect underflows.
+ */
+#define STABLE_NODE_CHAIN -1024
+ int rmap_hlist_len;
#ifdef CONFIG_NUMA
int nid;
#endif
@@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree;
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
+#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
+/* The number of stable_node chains */
+static unsigned long ksm_stable_node_chains;
+
+/* The number of stable_node dups linked to the stable_node chains */
+static unsigned long ksm_stable_node_dups;
+
+/* Delay in pruning stale stable_node_dups in the stable_node_chains */
+static int ksm_stable_node_chains_prune_millisecs = 2000;
+
+/* Maximum number of page slots sharing a stable node */
+static int ksm_max_page_sharing = 256;
+
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
@@ -287,6 +316,45 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
+static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+{
+ return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
+}
+
+static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+{
+ return dup->head == STABLE_NODE_DUP_HEAD;
+}
+
+static inline void stable_node_chain_add_dup(struct stable_node *dup,
+ struct stable_node *chain)
+{
+ VM_BUG_ON(is_stable_node_dup(dup));
+ dup->head = STABLE_NODE_DUP_HEAD;
+ VM_BUG_ON(!is_stable_node_chain(chain));
+ hlist_add_head(&dup->hlist_dup, &chain->hlist);
+ ksm_stable_node_dups++;
+}
+
+static inline void __stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ hlist_del(&dup->hlist_dup);
+ ksm_stable_node_dups--;
+}
+
+static inline void stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (is_stable_node_dup(dup))
+ __stable_node_dup_del(dup);
+ else
+ rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
+#ifdef CONFIG_DEBUG_VM
+ dup->head = NULL;
+#endif
+}
+
static inline struct rmap_item *alloc_rmap_item(void)
{
struct rmap_item *rmap_item;
@@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void)
static inline void free_stable_node(struct stable_node *stable_node)
{
+ VM_BUG_ON(stable_node->rmap_hlist_len &&
+ !is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
@@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
+static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+ struct rb_root *root)
+{
+ struct stable_node *chain = alloc_stable_node();
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (likely(chain)) {
+ INIT_HLIST_HEAD(&chain->hlist);
+ chain->chain_prune_time = jiffies;
+ chain->rmap_hlist_len = STABLE_NODE_CHAIN;
+#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
+ chain->nid = -1; /* debug */
+#endif
+ ksm_stable_node_chains++;
+
+ /*
+ * Put the stable node chain in the first dimension of
+ * the stable tree and at the same time remove the old
+ * stable node.
+ */
+ rb_replace_node(&dup->node, &chain->node, root);
+
+ /*
+ * Move the old stable node to the second dimension
+ * queued in the hlist_dup. The invariant is that all
+ * dup stable_nodes in the chain->hlist point to pages
+ * that are wrprotected and have the exact same
+ * content.
+ */
+ stable_node_chain_add_dup(dup, chain);
+ }
+ return chain;
+}
+
+static inline void free_stable_node_chain(struct stable_node *chain,
+ struct rb_root *root)
+{
+ rb_erase(&chain->node, root);
+ free_stable_node(chain);
+ ksm_stable_node_chains--;
+}
+
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
+ /* check it's not STABLE_NODE_CHAIN or negative */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
+ /*
+ * We need the second aligned pointer of the migrate_nodes
+ * list_head to stay clear from the rb_parent_color union
+ * (aligned and different than any node) and also different
+ * from &migrate_nodes. This will verify that future list.h changes
+ * don't break STABLE_NODE_DUP_HEAD.
+ */
+#if GCC_VERSION >= 40903 /* only recent gcc can handle it */
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
+#endif
+
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ stable_node_dup_del(stable_node);
free_stable_node(stable_node);
}
@@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
@@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
+static int remove_stable_node_chain(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ if (remove_stable_node(stable_node))
+ return true;
+ else
+ return false;
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ if (remove_stable_node(dup))
+ return true;
+ }
+ BUG_ON(!hlist_empty(&stable_node->hlist));
+ free_stable_node_chain(stable_node, root);
+ return false;
+}
+
static int remove_all_stable_nodes(void)
{
struct stable_node *stable_node, *next;
@@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void)
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
struct stable_node, node);
- if (remove_stable_node(stable_node)) {
+ if (remove_stable_node_chain(stable_node,
+ root_stable_tree + nid)) {
err = -EBUSY;
break; /* proceed to next nid */
}
@@ -1028,8 +1183,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
goto out;
if (PageTransCompound(page)) {
- err = split_huge_page(page);
- if (err)
+ if (split_huge_page(page))
goto out_unlock;
}
@@ -1139,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
return err ? NULL : page;
}
+static __always_inline
+bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+{
+ VM_BUG_ON(stable_node->rmap_hlist_len < 0);
+ /*
+ * Check that at least one mapping still exists, otherwise
+ * there's not much point to merge and share with this
+ * stable_node, as the underlying tree_page of the other
+ * sharer is going to be freed soon.
+ */
+ return stable_node->rmap_hlist_len &&
+ stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
+}
+
+static __always_inline
+bool is_page_sharing_candidate(struct stable_node *stable_node)
+{
+ return __is_page_sharing_candidate(stable_node, 0);
+}
+
+struct page *stable_node_dup(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct hlist_node *hlist_safe;
+ struct page *_tree_page, *tree_page = NULL;
+ int nr = 0;
+ int found_rmap_hlist_len;
+
+ if (!prune_stale_stable_nodes ||
+ time_before(jiffies, stable_node->chain_prune_time +
+ msecs_to_jiffies(
+ ksm_stable_node_chains_prune_millisecs)))
+ prune_stale_stable_nodes = false;
+ else
+ stable_node->chain_prune_time = jiffies;
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ cond_resched();
+ /*
+ * We must walk all stable_node_dup to prune the stale
+ * stable nodes during lookup.
+ *
+ * get_ksm_page can drop the nodes from the
+ * stable_node->hlist if they point to freed pages
+ * (that's why we do a _safe walk). The "dup"
+ * stable_node parameter itself will be freed from
+ * under us if it returns NULL.
+ */
+ _tree_page = get_ksm_page(dup, false);
+ if (!_tree_page)
+ continue;
+ nr += 1;
+ if (is_page_sharing_candidate(dup)) {
+ if (!found ||
+ dup->rmap_hlist_len > found_rmap_hlist_len) {
+ if (found)
+ put_page(tree_page);
+ found = dup;
+ found_rmap_hlist_len = found->rmap_hlist_len;
+ tree_page = _tree_page;
+
+ /* skip put_page for found dup */
+ if (!prune_stale_stable_nodes)
+ break;
+ continue;
+ }
+ }
+ put_page(_tree_page);
+ }
+
+ if (found) {
+ /*
+ * nr is counting all dups in the chain only if
+ * prune_stale_stable_nodes is true, otherwise we may
+ * break the loop at nr == 1 even if there are
+ * multiple entries.
+ */
+ if (prune_stale_stable_nodes && nr == 1) {
+ /*
+ * If there's not just one entry it would
+ * corrupt memory, better BUG_ON. In KSM
+ * context with no lock held it's not even
+ * fatal.
+ */
+ BUG_ON(stable_node->hlist.first->next);
+
+ /*
+ * There's just one entry and it is below the
+ * deduplication limit so drop the chain.
+ */
+ rb_replace_node(&stable_node->node, &found->node,
+ root);
+ free_stable_node(stable_node);
+ ksm_stable_node_chains--;
+ ksm_stable_node_dups--;
+ /*
+ * NOTE: the caller depends on the stable_node
+ * to be equal to stable_node_dup if the chain
+ * was collapsed.
+ */
+ *_stable_node = found;
+ /*
+ * Just for robustness, as stable_node is
+ * otherwise left as a stale pointer, the
+ * compiler shall optimize it away at build
+ * time.
+ */
+ stable_node = NULL;
+ } else if (stable_node->hlist.first != &found->hlist_dup &&
+ __is_page_sharing_candidate(found, 1)) {
+ /*
+ * If the found stable_node dup can accept one
+ * more future merge (in addition to the one
+ * that is underway) and is not at the head of
+ * the chain, put it there so next search will
+ * be quicker in the !prune_stale_stable_nodes
+ * case.
+ *
+ * NOTE: it would be inaccurate to use nr > 1
+ * instead of checking the hlist.first pointer
+ * directly, because in the
+ * prune_stale_stable_nodes case "nr" isn't
+ * the position of the found dup in the chain,
+ * but the total number of dups in the chain.
+ */
+ hlist_del(&found->hlist_dup);
+ hlist_add_head(&found->hlist_dup,
+ &stable_node->hlist);
+ }
+ }
+
+ *_stable_node_dup = found;
+ return tree_page;
+}
+
+static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ if (!is_stable_node_chain(stable_node))
+ return stable_node;
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return NULL;
+ }
+ return hlist_entry(stable_node->hlist.first,
+ typeof(*stable_node), hlist_dup);
+}
+
+/*
+ * Like for get_ksm_page, this function can free the *_stable_node and
+ * *_stable_node_dup if the returned tree_page is NULL.
+ *
+ * It can also free and overwrite *_stable_node with the found
+ * stable_node_dup if the chain is collapsed (in which case
+ * *_stable_node will be equal to *_stable_node_dup like if the chain
+ * never existed). It's up to the caller to verify tree_page is not
+ * NULL before dereferencing *_stable_node or *_stable_node_dup.
+ *
+ * *_stable_node_dup is really a second output parameter of this
+ * function and will be overwritten in all cases, the caller doesn't
+ * need to initialize it.
+ */
+static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *stable_node = *_stable_node;
+ if (!is_stable_node_chain(stable_node)) {
+ if (is_page_sharing_candidate(stable_node)) {
+ *_stable_node_dup = stable_node;
+ return get_ksm_page(stable_node, false);
+ }
+ /*
+ * _stable_node_dup set to NULL means the stable_node
+ * reached the ksm_max_page_sharing limit.
+ */
+ *_stable_node_dup = NULL;
+ return NULL;
+ }
+ return stable_node_dup(_stable_node_dup, _stable_node, root,
+ prune_stale_stable_nodes);
+}
+
+static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
+ struct stable_node **s_n,
+ struct rb_root *root)
+{
+ return __stable_node_chain(s_n_d, s_n, root, true);
+}
+
+static __always_inline struct page *chain(struct stable_node **s_n_d,
+ struct stable_node *s_n,
+ struct rb_root *root)
+{
+ struct stable_node *old_stable_node = s_n;
+ struct page *tree_page;
+
+ tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
+ /* not pruning dups so s_n cannot have changed */
+ VM_BUG_ON(s_n != old_stable_node);
+ return tree_page;
+}
+
/*
* stable_tree_search - search for page inside the stable tree
*
@@ -1154,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
struct stable_node *page_node;
page_node = page_stable_node(page);
@@ -1176,7 +1538,44 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ tree_page = chain_prune(&stable_node_dup, &stable_node, root);
+ /*
+ * NOTE: stable_node may have been freed by
+ * chain_prune() if the returned stable_node_dup is
+ * not NULL. stable_node_dup may have been inserted in
+ * the rbtree instead as a regular stable_node (in
+ * order to collapse the stable_node chain if a single
+ * stable_node dup was found in it). In such case the
+ * stable_node is overwritten by the callee to point
+ * to the stable_node_dup that was collapsed in the
+ * stable rbtree and stable_node will be equal to
+ * stable_node_dup like if the chain never existed.
+ */
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1199,6 +1598,34 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ /*
+ * Test if the migrated page should be merged
+ * into a stable node dup. If the mapcount is
+ * 1 we can migrate it with another KSM page
+ * without adding it to the chain.
+ */
+ if (page_mapcount(page) > 1)
+ goto chain_append;
+ }
+
+ if (!stable_node_dup) {
+ /*
+ * If the stable_node is a chain and
+ * we got a payload match in memcmp
+ * but we cannot merge the scanned
+ * page in any of the existing
+ * stable_node dups because they're
+ * all full, we need to wait for the
+ * scanned page to find itself a match
+ * in the unstable tree to create a
+ * brand new KSM page to add later to
+ * the dups of this stable_node.
+ */
+ return NULL;
+ }
+
/*
* Lock and unlock the stable_node's page (which
* might already have been migrated) so that page
@@ -1206,23 +1633,21 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node, true);
- if (tree_page) {
- unlock_page(tree_page);
- if (get_kpfn_nid(stable_node->kpfn) !=
- NUMA(stable_node->nid)) {
- put_page(tree_page);
- goto replace;
- }
- return tree_page;
- }
- /*
- * There is now a place for page_node, but the tree may
- * have been rebalanced, so re-evaluate parent and new.
- */
- if (page_node)
+ tree_page = get_ksm_page(stable_node_dup, true);
+ if (unlikely(!tree_page))
+ /*
+ * The tree may have been rebalanced,
+ * so re-evaluate parent and new.
+ */
goto again;
- return NULL;
+ unlock_page(tree_page);
+
+ if (get_kpfn_nid(stable_node_dup->kpfn) !=
+ NUMA(stable_node_dup->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
}
}
@@ -1233,22 +1658,95 @@ again:
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, root);
- get_page(page);
- return page;
+out:
+ if (is_page_sharing_candidate(page_node)) {
+ get_page(page);
+ return page;
+ } else
+ return NULL;
replace:
- if (page_node) {
- list_del(&page_node->list);
- DO_NUMA(page_node->nid = nid);
- rb_replace_node(&stable_node->node, &page_node->node, root);
- get_page(page);
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* there is no chain */
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node_dup->node,
+ &page_node->node,
+ root);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ rb_erase(&stable_node_dup->node, root);
+ page = NULL;
+ }
} else {
- rb_erase(&stable_node->node, root);
- page = NULL;
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ __stable_node_dup_del(stable_node_dup);
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ page = NULL;
+ }
}
- stable_node->head = &migrate_nodes;
- list_add(&stable_node->list, stable_node->head);
+ stable_node_dup->head = &migrate_nodes;
+ list_add(&stable_node_dup->list, stable_node_dup->head);
return page;
+
+chain_append:
+ /* stable_node_dup could be null if it reached the limit */
+ if (!stable_node_dup)
+ stable_node_dup = stable_node_any;
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(stable_node_dup,
+ root);
+ if (!stable_node)
+ return NULL;
+ }
+ /*
+ * Add this stable_node dup that was
+ * migrated to the stable_node chain
+ * of the current nid for this page
+ * content.
+ */
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ goto out;
}
/*
@@ -1265,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ bool need_chain = false;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
@@ -1280,7 +1779,32 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ tree_page = chain(&stable_node_dup, stable_node, root);
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1303,27 +1827,37 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
- /*
- * It is not a bug that stable_tree_search() didn't
- * find this node: because at that time our page was
- * not yet write-protected, so may have changed since.
- */
- return NULL;
+ need_chain = true;
+ break;
}
}
- stable_node = alloc_stable_node();
- if (!stable_node)
+ stable_node_dup = alloc_stable_node();
+ if (!stable_node_dup)
return NULL;
- INIT_HLIST_HEAD(&stable_node->hlist);
- stable_node->kpfn = kpfn;
- set_page_stable_node(kpage, stable_node);
- DO_NUMA(stable_node->nid = nid);
- rb_link_node(&stable_node->node, parent, new);
- rb_insert_color(&stable_node->node, root);
+ INIT_HLIST_HEAD(&stable_node_dup->hlist);
+ stable_node_dup->kpfn = kpfn;
+ set_page_stable_node(kpage, stable_node_dup);
+ stable_node_dup->rmap_hlist_len = 0;
+ DO_NUMA(stable_node_dup->nid = nid);
+ if (!need_chain) {
+ rb_link_node(&stable_node_dup->node, parent, new);
+ rb_insert_color(&stable_node_dup->node, root);
+ } else {
+ if (!is_stable_node_chain(stable_node)) {
+ struct stable_node *orig = stable_node;
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(orig, root);
+ if (!stable_node) {
+ free_stable_node(stable_node_dup);
+ return NULL;
+ }
+ }
+ stable_node_chain_add_dup(stable_node_dup, stable_node);
+ }
- return stable_node;
+ return stable_node_dup;
}
/*
@@ -1413,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* the same ksm page.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node)
+ struct stable_node *stable_node,
+ bool max_page_sharing_bypass)
{
+ /*
+ * rmap won't find this mapping if we don't insert the
+ * rmap_item in the right stable_node
+ * duplicate. page_migration could break later if rmap breaks,
+ * so we can as well crash here. We really need to check for
+ * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
+ * for other negative values as an underflow if detected here
+ * for the first time (and not when decreasing rmap_hlist_len)
+ * would be a sign of memory corruption in the stable_node.
+ */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
+ stable_node->rmap_hlist_len++;
+ if (!max_page_sharing_bypass)
+ /* possibly non fatal but unexpected overflow, only warn */
+ WARN_ON_ONCE(stable_node->rmap_hlist_len >
+ ksm_max_page_sharing);
+
rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1442,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
struct page *kpage;
unsigned int checksum;
int err;
+ bool max_page_sharing_bypass = false;
stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
- get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
+ NUMA(stable_node->nid)) {
+ stable_node_dup_del(stable_node);
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
+ /*
+ * If it's a KSM fork, allow it to go over the sharing limit
+ * without warnings.
+ */
+ if (!is_page_sharing_candidate(stable_node))
+ max_page_sharing_bypass = true;
}
/* We first start with searching the page inside the stable tree */
@@ -1474,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
- stable_tree_append(rmap_item, page_stable_node(kpage));
+ stable_tree_append(rmap_item, page_stable_node(kpage),
+ max_page_sharing_bypass);
unlock_page(kpage);
}
put_page(kpage);
@@ -1524,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
- stable_tree_append(tree_rmap_item, stable_node);
- stable_tree_append(rmap_item, stable_node);
+ stable_tree_append(tree_rmap_item, stable_node,
+ false);
+ stable_tree_append(rmap_item, stable_node,
+ false);
}
unlock_page(kpage);
@@ -2029,6 +2592,48 @@ static void wait_while_offlining(void)
}
}
+static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) {
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ return true;
+ }
+ return false;
+}
+
+static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ return stable_node_dup_remove_range(stable_node, start_pfn,
+ end_pfn);
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ stable_node_dup_remove_range(dup, start_pfn, end_pfn);
+ }
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return true; /* notify caller that tree was rebalanced */
+ } else
+ return false;
+}
+
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -2040,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
node = rb_first(root_stable_tree + nid);
while (node) {
stable_node = rb_entry(node, struct stable_node, node);
- if (stable_node->kpfn >= start_pfn &&
- stable_node->kpfn < end_pfn) {
- /*
- * Don't get_ksm_page, page has already gone:
- * which is why we keep kpfn instead of page*
- */
- remove_node_from_stable_tree(stable_node);
+ if (stable_node_chain_remove_range(stable_node,
+ start_pfn, end_pfn,
+ root_stable_tree +
+ nid))
node = rb_first(root_stable_tree + nid);
- } else
+ else
node = rb_next(node);
cond_resched();
}
@@ -2294,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj,
}
KSM_ATTR(use_zero_pages);
+static ssize_t max_page_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_max_page_sharing);
+}
+
+static ssize_t max_page_sharing_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ int knob;
+
+ err = kstrtoint(buf, 10, &knob);
+ if (err)
+ return err;
+ /*
+ * When a KSM page is created it is shared by 2 mappings. This
+ * being a signed comparison, it implicitly verifies it's not
+ * negative.
+ */
+ if (knob < 2)
+ return -EINVAL;
+
+ if (READ_ONCE(ksm_max_page_sharing) == knob)
+ return count;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_max_page_sharing != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else
+ ksm_max_page_sharing = knob;
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(max_page_sharing);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2332,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t stable_node_dups_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+}
+KSM_ATTR_RO(stable_node_dups);
+
+static ssize_t stable_node_chains_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+}
+KSM_ATTR_RO(stable_node_chains);
+
+static ssize_t
+stable_node_chains_prune_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+}
+
+static ssize_t
+stable_node_chains_prune_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_stable_node_chains_prune_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(stable_node_chains_prune_millisecs);
+
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2351,6 +3034,10 @@ static struct attribute *ksm_attrs[] = {
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
+ &max_page_sharing_attr.attr,
+ &stable_node_chains_attr.attr,
+ &stable_node_dups_attr.attr,
+ &stable_node_chains_prune_millisecs_attr.attr,
&use_zero_pages_attr.attr,
NULL,
};
diff --git a/mm/memblock.c b/mm/memblock.c
index b049c9b2dba8..2cb25fe4452c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = {
};
int memblock_debug __initdata_memblock;
-#ifdef CONFIG_MOVABLE_NODE
-bool movable_node_enabled __initdata_memblock = false;
-#endif
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -1739,6 +1736,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
+extern unsigned long __init_memblock
+memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
+{
+ struct memblock_region *rgn;
+ unsigned long size = 0;
+ int idx;
+
+ for_each_memblock_type((&memblock.reserved), rgn) {
+ phys_addr_t start, end;
+
+ if (rgn->base + rgn->size < start_addr)
+ continue;
+ if (rgn->base > end_addr)
+ continue;
+
+ start = rgn->base;
+ end = start + rgn->size;
+ size += end - start;
+ }
+
+ return size;
+}
+
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..425aa0caa712 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -170,7 +170,7 @@ struct mem_cgroup_event {
*/
poll_table pt;
wait_queue_head_t *wqh;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct work_struct remove;
};
@@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *memcg;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int memcg_oom_wake_function(wait_queue_t *wait,
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
@@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
+ INIT_LIST_HEAD(&owait.wait.entry);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
@@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head)
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- bool charge)
+ int nr_entries)
{
- int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
+ this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
}
/**
@@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mem_cgroup_swap_statistics(from, false);
- mem_cgroup_swap_statistics(to, true);
+ mem_cgroup_swap_statistics(from, -1);
+ mem_cgroup_swap_statistics(to, 1);
return 0;
}
return -EINVAL;
@@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+ seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
return 0;
}
@@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
@@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
+ pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat) {
+ kfree(pn);
+ return 1;
+ }
+
lruvec_init(&pn->lruvec);
pn->usage_in_excess = 0;
pn->on_tree = false;
@@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
- kfree(memcg->nodeinfo[node]);
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+ free_percpu(pn->lruvec_stat);
+ kfree(pn);
}
static void __mem_cgroup_free(struct mem_cgroup *memcg)
@@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v)
seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
+ seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
return 0;
}
@@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
- stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(stat[NR_SLAB_RECLAIMABLE] +
+ stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
@@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v)
}
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+ seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
+ seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
+ events[PGSCAN_DIRECT]);
+ seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
+ events[PGSTEAL_DIRECT]);
+ seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
+ seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
+ seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
+ seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+
seq_printf(m, "workingset_refault %lu\n",
stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
@@ -5445,7 +5465,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry);
+ mem_cgroup_uncharge_swap(entry, nr_pages);
}
}
@@ -5873,9 +5893,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, true);
+ mem_cgroup_swap_statistics(swap_memcg, 1);
page->mem_cgroup = NULL;
@@ -5902,19 +5922,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
css_put(&memcg->css);
}
-/*
- * mem_cgroup_try_charge_swap - try charging a swap entry
+/**
+ * mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
- * Try to charge @entry to the memcg that @page belongs to.
+ * Try to charge @page's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
- struct mem_cgroup *memcg;
+ unsigned int nr_pages = hpage_nr_pages(page);
struct page_counter *counter;
+ struct mem_cgroup *memcg;
unsigned short oldid;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
@@ -5929,25 +5950,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
- !page_counter_try_charge(&memcg->swap, 1, &counter)) {
+ !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ /* Get references for the tail pages, too */
+ if (nr_pages > 1)
+ mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ mem_cgroup_swap_statistics(memcg, nr_pages);
return 0;
}
/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
- *
- * Drop the swap charge associated with @entry.
+ * @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
@@ -5955,18 +5978,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
if (!do_swap_account)
return;
- id = swap_cgroup_record(entry, 0);
+ id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->swap, 1);
+ page_counter_uncharge(&memcg->swap, nr_pages);
else
- page_counter_uncharge(&memcg->memsw, 1);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
}
- mem_cgroup_swap_statistics(memcg, false);
- mem_cgroup_id_put(memcg);
+ mem_cgroup_swap_statistics(memcg, -nr_pages);
+ mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3c5e0b8162f3..dbe3e50c9aa5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- page_flags = p->flags;
+ if (PageHuge(p))
+ page_flags = hpage->flags;
+ else
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1489,11 +1492,16 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- if (PageHuge(p))
- return alloc_huge_page_node(page_hstate(compound_head(p)),
- nid);
- else
+ if (PageHuge(p)) {
+ struct hstate *hstate = page_hstate(compound_head(p));
+
+ if (hstate_is_gigantic(hstate))
+ return alloc_huge_page_node(hstate, NUMA_NO_NODE);
+
+ return alloc_huge_page_node(hstate, nid);
+ } else {
return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ }
}
/*
@@ -1595,12 +1603,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
pfn, ret, page->flags, &page->flags);
- /*
- * We know that soft_offline_huge_page() tries to migrate
- * only one hugepage pointed to by hpage, so we need not
- * run through the pagelist here.
- */
- putback_active_hugepage(hpage);
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
if (ret > 0)
ret = -EIO;
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 6ff5d729ded0..e31dd97e6114 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf)
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -2855,40 +2855,6 @@ out_release:
}
/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
- address &= PAGE_MASK;
- if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
- struct vm_area_struct *prev = vma->vm_prev;
-
- /*
- * Is there a mapping abutting this one below?
- *
- * That's only ok if it's the same stack mapping
- * that has gotten split..
- */
- if (prev && prev->vm_end == address)
- return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
- return expand_downwards(vma, address - PAGE_SIZE);
- }
- if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
- struct vm_area_struct *next = vma->vm_next;
-
- /* As VM_GROWSDOWN but s/below/above/ */
- if (next && next->vm_start == address + PAGE_SIZE)
- return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
- return expand_upwards(vma, address + PAGE_SIZE);
- }
- return 0;
-}
-
-/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
- /* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, vmf->address) < 0)
- return VM_FAULT_SIGSEGV;
-
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
@@ -3029,6 +2991,17 @@ static int __do_fault(struct vm_fault *vmf)
return ret;
}
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+ return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
static int pte_alloc_one_map(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3052,18 +3025,27 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
map_pte:
/*
* If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
- * didn't become pmd_trans_huge under us and then back to pmd_none, as
- * a result of MADV_DONTNEED running immediately after a huge pmd fault
- * in a different thread of this mm, in turn leading to a misleading
- * pmd_trans_huge() retval. All we have to ensure is that it is a
- * regular pmd that we can walk with pte_offset_map() and we can do that
- * through an atomic read in C, which is what pmd_trans_unstable()
- * provides.
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
+ * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
+ * under us and then back to pmd_none, as a result of MADV_DONTNEED
+ * running immediately after a huge pmd fault in a different thread of
+ * this mm, in turn leading to a misleading pmd_trans_huge() retval.
+ * All we have to ensure is that it is a regular pmd that we can walk
+ * with pte_offset_map() and we can do that through an atomic read in
+ * C, which is what pmd_trans_unstable() provides.
*/
- if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
+ if (pmd_devmap_trans_unstable(vmf->pmd))
return VM_FAULT_NOPAGE;
+ /*
+ * At this point we know that our vmf->pmd points to a page of ptes
+ * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
+ * for the duration of the fault. If a racing MADV_DONTNEED runs and
+ * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
+ * be valid and we will re-check to make sure the vmf->pte isn't
+ * pte_none() under vmf->ptl protection when we return to
+ * alloc_set_pte().
+ */
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
return 0;
@@ -3690,7 +3672,7 @@ static int handle_pte_fault(struct vm_fault *vmf)
vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
+ if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3855,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
@@ -4032,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!ptep)
- goto out;
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b63d7d1239df..f79aac7a12b5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -79,6 +79,8 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool movable_node_enabled = false;
+
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
@@ -300,229 +302,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_zone_end_pfn;
-
- zone_span_writelock(zone);
-
- old_zone_end_pfn = zone_end_pfn(zone);
- if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
- zone->zone_start_pfn = start_pfn;
-
- zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
- zone->zone_start_pfn;
-
- zone_span_writeunlock(zone);
-}
-
-static void resize_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- zone_span_writelock(zone);
-
- if (end_pfn - start_pfn) {
- zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = end_pfn - start_pfn;
- } else {
- /*
- * make it consist as free_area_init_core(),
- * if spanned_pages = 0, then keep start_pfn = 0
- */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
- }
-
- zone_span_writeunlock(zone);
-}
-
-static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- enum zone_type zid = zone_idx(zone);
- int nid = zone->zone_pgdat->node_id;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- set_page_links(pfn_to_page(pfn), zid, nid, pfn);
-}
-
-/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
-static int __ref ensure_zone_is_initialized(struct zone *zone,
- unsigned long start_pfn, unsigned long num_pages)
-{
- if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages);
-
- return 0;
-}
-
-static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
+static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
+ bool want_memblock)
{
int ret;
- unsigned long flags;
- unsigned long z1_start_pfn;
-
- ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are higher than @z2 */
- if (end_pfn > zone_end_pfn(z2))
- goto out_fail;
- /* the move out part must be at the left most of @z2 */
- if (start_pfn > z2->zone_start_pfn)
- goto out_fail;
- /* must included/overlap */
- if (end_pfn <= z2->zone_start_pfn)
- goto out_fail;
-
- /* use start_pfn for z1's start_pfn if z1 is empty */
- if (!zone_is_empty(z1))
- z1_start_pfn = z1->zone_start_pfn;
- else
- z1_start_pfn = start_pfn;
-
- resize_zone(z1, z1_start_pfn, end_pfn);
- resize_zone(z2, end_pfn, zone_end_pfn(z2));
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z1, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int ret;
- unsigned long flags;
- unsigned long z2_end_pfn;
-
- ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are lower than @z1 */
- if (z1->zone_start_pfn > start_pfn)
- goto out_fail;
- /* the move out part mast at the right most of @z1 */
- if (zone_end_pfn(z1) > end_pfn)
- goto out_fail;
- /* must included/overlap */
- if (start_pfn >= zone_end_pfn(z1))
- goto out_fail;
-
- /* use end_pfn for z2's end_pfn if z2 is empty */
- if (!zone_is_empty(z2))
- z2_end_pfn = zone_end_pfn(z2);
- else
- z2_end_pfn = end_pfn;
-
- resize_zone(z1, z1->zone_start_pfn, start_pfn);
- resize_zone(z2, start_pfn, z2_end_pfn);
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z2, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static struct zone * __meminit move_pfn_range(int zone_shift,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- struct zone *zone = page_zone(pfn_to_page(start_pfn));
- int ret = 0;
-
- if (zone_shift < 0)
- ret = move_pfn_range_left(zone + zone_shift, zone,
- start_pfn, end_pfn);
- else if (zone_shift)
- ret = move_pfn_range_right(zone, zone + zone_shift,
- start_pfn, end_pfn);
-
- if (ret)
- return NULL;
-
- return zone + zone_shift;
-}
-
-static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
-
- if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
- pgdat->node_start_pfn = start_pfn;
-
- pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
- pgdat->node_start_pfn;
-}
+ int i;
-static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
- int nid = pgdat->node_id;
- int zone_type;
- unsigned long flags, pfn;
- int ret;
+ if (pfn_valid(phys_start_pfn))
+ return -EEXIST;
- zone_type = zone - pgdat->node_zones;
- ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
- if (ret)
+ ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
+ if (ret < 0)
return ret;
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
- grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
- phys_start_pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
- memmap_init_zone(nr_pages, nid, zone_type,
- phys_start_pfn, MEMMAP_HOTPLUG);
-
- /* online_page_range is called later and expects pages reserved */
- for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ /*
+ * Make all the pages reserved so that nobody will stumble over half
+ * initialized state.
+ * FIXME: We also have to associate it with a node because pfn_to_node
+ * relies on having a page with the proper node.
+ */
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ unsigned long pfn = phys_start_pfn + i;
+ struct page *page;
if (!pfn_valid(pfn))
continue;
- SetPageReserved(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
+ set_page_node(page, nid);
+ SetPageReserved(page);
}
- return 0;
-}
-
-static int __meminit __add_section(int nid, struct zone *zone,
- unsigned long phys_start_pfn)
-{
- int ret;
-
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(zone, phys_start_pfn);
-
- if (ret < 0)
- return ret;
-
- ret = __add_zone(zone, phys_start_pfn);
- if (ret < 0)
- return ret;
+ if (!want_memblock)
+ return 0;
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
@@ -533,16 +344,14 @@ static int __meminit __add_section(int nid, struct zone *zone,
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+int __ref __add_pages(int nid, unsigned long phys_start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
struct vmem_altmap *altmap;
- clear_zone_contiguous(zone);
-
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -562,7 +371,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, section_nr_to_pfn(i));
+ err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -575,7 +384,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
vmemmap_populate_print_last();
out:
- set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -939,33 +747,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
struct page *page;
+
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
(*online_page_callback)(page);
onlined_pages++;
}
+
+ online_mem_sections(start_pfn, start_pfn + nr_pages);
+
*(unsigned long *)arg = onlined_pages;
return 0;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
- * normal memory.
- */
-static bool can_online_high_movable(struct zone *zone)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure every online node has NORMAL memory */
-static bool can_online_high_movable(struct zone *zone)
-{
- return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1040,39 +835,131 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target, int *zone_shift)
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
{
- struct zone *zone = page_zone(pfn_to_page(pfn));
- enum zone_type idx = zone_idx(zone);
- int i;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
- *zone_shift = 0;
+ /*
+ * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+ * physically before ZONE_MOVABLE. All we need is they do not
+ * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+ * though so let's stick with it for simplicity for now.
+ * TODO make sure we do not overlap with ZONE_DEVICE
+ */
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (zone_is_empty(movable_zone))
+ return true;
+ return movable_zone->zone_start_pfn >= pfn + nr_pages;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ return zone_end_pfn(default_zone) <= pfn;
+ }
- if (idx < target) {
- /* pages must be at end of current zone */
- if (pfn + nr_pages != zone_end_pfn(zone))
- return false;
+ /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+ return online_type == MMOP_ONLINE_KEEP;
+}
+
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = zone_end_pfn(zone);
+
+ if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
+
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
+
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+}
+
+void __ref move_pfn_range_to_zone(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nid = pgdat->node_id;
+ unsigned long flags;
- /* no zones in use between current zone and target */
- for (i = idx + 1; i < target; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
+
+ clear_zone_contiguous(zone);
+
+ /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+ pgdat_resize_lock(pgdat, &flags);
+ zone_span_writelock(zone);
+ resize_zone_range(zone, start_pfn, nr_pages);
+ zone_span_writeunlock(zone);
+ resize_pgdat_range(pgdat, start_pfn, nr_pages);
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /*
+ * TODO now we have a visible range of pages which are not associated
+ * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * expects the zone spans the pfn range. All the pages in the range
+ * are reserved so nobody should be touching them so we should be safe
+ */
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+
+ set_zone_contiguous(zone);
+}
+
+/*
+ * Returns a default kernel memory zone for the given pfn range.
+ * If no kernel zone covers this pfn range it will automatically go
+ * to the ZONE_NORMAL.
+ */
+struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int zid;
+
+ for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_intersects(zone, start_pfn, nr_pages))
+ return zone;
}
- if (target < idx) {
- /* pages must be at beginning of current zone */
- if (pfn != zone->zone_start_pfn)
- return false;
+ return &pgdat->node_zones[ZONE_NORMAL];
+}
- /* no zones in use between current zone and target */
- for (i = target + 1; i < idx; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_KEEP) {
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ /*
+ * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
+ * movable zone if that is not possible (e.g. we are within
+ * or past the existing movable zone)
+ */
+ if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
+ MMOP_ONLINE_KERNEL))
+ zone = movable_zone;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ zone = &pgdat->node_zones[ZONE_MOVABLE];
}
- *zone_shift = target - idx;
- return true;
+ move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+ return zone;
}
/* Must be protected by mem_hotplug_begin() */
@@ -1085,38 +972,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
- int zone_shift = 0;
- /*
- * This doesn't need a lock to do pfn_to_page().
- * The section can't be removed here because of the
- * memory_block->state_mutex.
- */
- zone = page_zone(pfn_to_page(pfn));
-
- if ((zone_idx(zone) > ZONE_NORMAL ||
- online_type == MMOP_ONLINE_MOVABLE) &&
- !can_online_high_movable(zone))
+ nid = pfn_to_nid(pfn);
+ if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
- return -EINVAL;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
- return -EINVAL;
- }
-
- zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
- if (!zone)
- return -EINVAL;
+ /* associate pfn range with the zone */
+ zone = move_pfn_range(online_type, nid, pfn, nr_pages);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = zone_to_nid(zone);
-
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
if (ret)
@@ -1311,39 +1178,6 @@ static int check_hotplug_memory_range(u64 start, u64 size)
return 0;
}
-/*
- * If movable zone has already been setup, newly added memory should be check.
- * If its address is higher than movable zone, it should be added as movable.
- * Without this check, movable zone may overlap with other zone.
- */
-static int should_add_memory_movable(int nid, u64 start, u64 size)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- pg_data_t *pgdat = NODE_DATA(nid);
- struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
-
- if (zone_is_empty(movable_zone))
- return 0;
-
- if (movable_zone->zone_start_pfn <= start_pfn)
- return 1;
-
- return 0;
-}
-
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
- bool for_device)
-{
-#ifdef CONFIG_ZONE_DEVICE
- if (for_device)
- return ZONE_DEVICE;
-#endif
- if (should_add_memory_movable(nid, start, size))
- return ZONE_MOVABLE;
-
- return zone_default;
-}
-
static int online_memory_block(struct memory_block *mem, void *arg)
{
return device_online(&mem->dev);
@@ -1389,7 +1223,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, false);
+ ret = arch_add_memory(nid, start, size, true);
if (ret < 0)
goto error;
@@ -1398,7 +1232,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
node_set_online(nid);
if (new_node) {
- ret = register_one_node(nid);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ ret = __register_one_node(nid);
+ if (ret)
+ goto register_fail;
+
+ /*
+ * link memory sections under this node. This is already
+ * done when creating memory section in register_new_memory
+ * but that depends on having the node registered so offline
+ * nodes have to go through register_node.
+ * TODO clean up this mess.
+ */
+ ret = link_mem_sections(nid, start_pfn, nr_pages);
+register_fail:
/*
* If sysfs file of new node can't create, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1592,11 +1441,9 @@ static struct page *new_node_page(struct page *page, unsigned long private,
gfp_mask |= __GFP_HIGHMEM;
if (!nodes_empty(nmask))
- new_page = __alloc_pages_nodemask(gfp_mask, 0,
- node_zonelist(nid, gfp_mask), &nmask);
+ new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask);
if (!new_page)
- new_page = __alloc_pages(gfp_mask, 0,
- node_zonelist(nid, gfp_mask));
+ new_page = __alloc_pages(gfp_mask, 0, nid);
return new_page;
}
@@ -1725,47 +1572,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
- * normal memory.
- */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure the node has NORMAL memory if it is still online */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- if (present_pages > nr_pages)
- return true;
-
- present_pages = 0;
- for (; zt <= ZONE_MOVABLE; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- /*
- * we can't offline the last normal memory until all
- * higher memory is offlined.
- */
- return present_pages == 0;
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_MOVABLE_NODE
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
#else
- pr_warn("movable_node option not supported\n");
+ pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
return 0;
}
@@ -1887,9 +1699,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- return -EINVAL;
-
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37d0b334bfe9..7d8e56214ac0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- /*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)
kmem_cache_free(policy_cache, p);
}
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
-/*
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
-static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
@@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- /*
- * if step == 1, we use ->w.cpuset_mems_allowed to cache the
- * result
- */
- if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
- nodes_remap(tmp, pol->v.nodes,
- pol->w.cpuset_mems_allowed, *nodes);
- pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
- } else if (step == MPOL_REBIND_STEP2) {
- tmp = pol->w.cpuset_mems_allowed;
- pol->w.cpuset_mems_allowed = *nodes;
- } else
- BUG();
+ nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = tmp;
}
if (nodes_empty(tmp))
tmp = *nodes;
- if (step == MPOL_REBIND_STEP1)
- nodes_or(pol->v.nodes, pol->v.nodes, tmp);
- else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
- pol->v.nodes = tmp;
- else
- BUG();
-
- if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node_in(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = numa_node_id();
- }
+ pol->v.nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes,
- enum mpol_rebind_step step)
+ const nodemask_t *nodes)
{
nodemask_t tmp;
@@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
*/
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
- enum mpol_rebind_step step)
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
- return;
-
- if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
- BUG();
-
- if (step == MPOL_REBIND_STEP1)
- pol->flags |= MPOL_F_REBINDING;
- else if (step == MPOL_REBIND_STEP2)
- pol->flags &= ~MPOL_F_REBINDING;
- else if (step >= MPOL_REBIND_NSTEP)
- BUG();
-
- mpol_ops[pol->mode].rebind(pol, newmask, step);
+ mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
@@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
* Called with task's alloc_lock held.
*/
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
- enum mpol_rebind_step step)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
- mpol_rebind_policy(tsk->mempolicy, new, step);
+ mpol_rebind_policy(tsk->mempolicy, new);
}
/*
@@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
+ mpol_rebind_policy(vma->vm_policy, new);
up_write(&mm->mmap_sem);
}
@@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE &&
- nodes_weight(new->v.nodes))
- current->il_next = first_node(new->v.nodes);
+ if (new && new->mode == MPOL_INTERLEAVE)
+ current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = current->il_next;
+ *policy = next_node_in(current->il_prev, pol->v.nodes);
} else {
err = -EINVAL;
goto out;
@@ -1676,9 +1606,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
return NULL;
}
-/* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
- int nd)
+/* Return the node id preferred by the given mempolicy, or the given id */
+static int policy_node(gfp_t gfp, struct mempolicy *policy,
+ int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
@@ -1691,20 +1621,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
- return node_zonelist(nd, gfp);
+ return nd;
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
- unsigned nid, next;
+ unsigned next;
struct task_struct *me = current;
- nid = me->il_next;
- next = next_node_in(nid, policy->v.nodes);
+ next = next_node_in(me->il_prev, policy->v.nodes);
if (next < MAX_NUMNODES)
- me->il_next = next;
- return nid;
+ me->il_prev = next;
+ return next;
}
/*
@@ -1799,38 +1728,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/*
- * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
* @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
*
- * Returns a zonelist suitable for a huge page allocation and a pointer
+ * Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
- gfp_t gfp_flags, struct mempolicy **mpol,
- nodemask_t **nodemask)
+int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
+ struct mempolicy **mpol, nodemask_t **nodemask)
{
- struct zonelist *zl;
+ int nid;
*mpol = get_vma_policy(vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
- zl = node_zonelist(interleave_nid(*mpol, vma, addr,
- huge_page_shift(hstate_vma(vma))), gfp_flags);
+ nid = interleave_nid(*mpol, vma, addr,
+ huge_page_shift(hstate_vma(vma)));
} else {
- zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
+ nid = policy_node(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
- return zl;
+ return nid;
}
/*
@@ -1932,12 +1860,10 @@ out:
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
- struct zonelist *zl;
struct page *page;
- zl = node_zonelist(nid, gfp);
- page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
+ page = __alloc_pages(gfp, order, nid);
+ if (page && page_to_nid(page) == nid)
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
@@ -1971,13 +1897,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- unsigned int cpuset_mems_cookie;
- struct zonelist *zl;
+ int preferred_nid;
nodemask_t *nmask;
-retry_cpuset:
pol = get_vma_policy(vma, addr);
- cpuset_mems_cookie = read_mems_allowed_begin();
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
@@ -2015,12 +1938,10 @@ retry_cpuset:
}
nmask = policy_nodemask(gfp, pol);
- zl = policy_zonelist(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ preferred_nid = policy_node(gfp, pol, node);
+ page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
return page;
}
@@ -2038,23 +1959,15 @@ out:
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
- *
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
- unsigned int cpuset_mems_cookie;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
-
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
@@ -2063,12 +1976,9 @@ retry_cpuset:
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, numa_node_id()),
+ policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
-
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -2112,10 +2022,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- if (new->flags & MPOL_F_REBINDING)
- mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
- else
- mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
+ mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
diff --git a/mm/mempool.c b/mm/mempool.c
index 47a659dedd44..1c0294858527 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
diff --git a/mm/migrate.c b/mm/migrate.c
index 89a0a1707f4c..051cc1555d36 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ flush_dcache_page(new);
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, vma, new, 0);
- }
-#endif
- flush_dcache_page(new);
- set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
- if (PageHuge(new)) {
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
else
page_dup_rmap(new, true);
- } else if (PageAnon(new))
- page_add_anon_rmap(new, vma, pvmw.address, false);
- else
- page_add_file_rmap(new, false);
+ } else
+#endif
+ {
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, pvmw.address, false);
+ else
+ page_add_file_rmap(new, false);
+ }
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
diff --git a/mm/mlock.c b/mm/mlock.c
index c483c5c20b4b..b562b5523a65 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
int i;
int nr = pagevec_count(pvec);
- int delta_munlocked;
+ int delta_munlocked = -nr;
struct pagevec pvec_putback;
int pgrescued = 0;
@@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
continue;
else
__munlock_isolation_failed(page);
+ } else {
+ delta_munlocked++;
}
/*
@@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(zone_lru_lock(zone));
diff --git a/mm/mmap.c b/mm/mmap.c
index f82741e199c0..5a0ba9788cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm,
* w: (no) no
* x: (yes) yes
*/
-pgprot_t protection_map[16] = {
+pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
@@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
LIST_HEAD(uf);
@@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
}
/* Check against existing mmap mappings. */
- if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ next = find_vma(mm, oldbrk);
+ if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
/* Ok, looks good - let it rip. */
@@ -253,10 +255,22 @@ out:
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
- unsigned long max, subtree_gap;
- max = vma->vm_start;
- if (vma->vm_prev)
- max -= vma->vm_prev->vm_end;
+ unsigned long max, prev_end, subtree_gap;
+
+ /*
+ * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+ * allow two stack_guard_gaps between them here, and when choosing
+ * an unmapped area; whereas when expanding we only require one.
+ * That's a little inconsistent, but keeps the code here simpler.
+ */
+ max = vm_start_gap(vma);
+ if (vma->vm_prev) {
+ prev_end = vm_end_gap(vma->vm_prev);
+ if (max > prev_end)
+ max -= prev_end;
+ else
+ max = 0;
+ }
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
- highest_address = vma->vm_end;
+ highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
}
@@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = vma->vm_end;
+ mm->highest_vm_end = vm_end_gap(vma);
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
@@ -856,7 +870,7 @@ again:
vma_gap_update(vma);
if (end_changed) {
if (!next)
- mm->highest_vm_end = end;
+ mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
@@ -941,7 +955,7 @@ again:
* mm->highest_vm_end doesn't need any update
* in remove_next == 1 case.
*/
- VM_WARN_ON(mm->highest_vm_end != end);
+ VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
}
if (insert && file)
@@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
while (true) {
/* Visit left subtree if it looks promising */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
@@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
}
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
- if (gap_end >= low_limit && gap_end - gap_start >= length)
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
@@ -1825,8 +1840,8 @@ check_current:
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
- gap_start = vma->vm_prev->vm_end;
- gap_end = vma->vm_start;
+ gap_start = vm_end_gap(vma->vm_prev);
+ gap_end = vm_start_gap(vma);
goto check_current;
}
}
@@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
while (true) {
/* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
@@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
- if (gap_start <= high_limit && gap_end - gap_start >= length)
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
@@ -1929,7 +1945,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
- vma->vm_prev->vm_end : 0;
+ vm_end_gap(vma->vm_prev) : 0;
goto check_current;
}
}
@@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE - mmap_min_addr)
@@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
@@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start, actual_size;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
- actual_size = size;
- if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
- actual_size -= PAGE_SIZE;
- if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /* Guard against wrapping around to address 0. */
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= TASK_SIZE)
return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = vma->vm_next;
+ if (next && next->vm_start < gap_addr) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
@@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = address;
+ mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
+ /* Enforce stack_guard_gap */
+ gap_addr = address - stack_guard_gap;
+ if (gap_addr > address)
+ return -ENOMEM;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end > gap_addr) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
@@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+/*
+ * Parse the "stack_guard_gap=" boot parameter.
+ *
+ * @p: text following "stack_guard_gap=", read as a decimal number of pages.
+ *
+ * stack_guard_gap is only updated when the whole argument is numeric; an
+ * argument with trailing characters leaves it untouched.
+ *
+ * Returns 1 in every case: a __setup() handler that returns 0 tells the
+ * kernel the option was not handled, so "stack_guard_gap=..." would be
+ * passed on to init as an unknown parameter.
+ */
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+	unsigned long val;
+	char *endptr;
+
+	val = simple_strtoul(p, &endptr, 10);
+	if (!*endptr)
+		stack_guard_gap = val << PAGE_SHIFT;	/* pages -> bytes */
+
+	return 1;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *next;
-
- address &= PAGE_MASK;
- next = vma->vm_next;
- if (next && next->vm_start == address + PAGE_SIZE) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- }
return expand_upwards(vma, address);
}
@@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *prev;
-
- address &= PAGE_MASK;
- prev = vma->vm_prev;
- if (prev && prev->vm_end == address) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
- return -ENOMEM;
- }
return expand_downwards(vma, address);
}
@@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = prev;
vma_gap_update(vma);
} else
- mm->highest_vm_end = prev ? prev->vm_end : 0;
+ mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;
/* Kill the cache */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8edd0d576254..1a8c9ca83e48 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* reading.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 487dad610731..36454d0f96ee 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
unsigned long end_pfn = min_t(unsigned long,
PFN_DOWN(end), max_low_pfn);
- if (start_pfn > end_pfn)
+ if (start_pfn >= end_pfn)
return 0;
__free_pages_memory(start_pfn, end_pfn);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04c9143a8625..0e2c925e7826 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
mmgrab(mm);
+
+ /* Raise event before sending signal: task reaper must see this */
+ count_vm_event(OOM_KILL);
+ count_memcg_event_mm(mm, OOM_KILL);
+
/*
* We should send SIGKILL before setting TIF_MEMDIE in order to prevent
* the OOM victim from depleting the memory reserves from the user
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 143c1c25d680..0b60cc7ddac2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2366,15 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
/**
- * write_one_page - write out a single page and optionally wait on I/O
+ * write_one_page - write out a single page and wait on I/O
* @page: the page to write
- * @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
- * write_one_page() returns a negative error code if I/O failed.
+ * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
+ * function returns.
*/
-int write_one_page(struct page *page, int wait)
+int write_one_page(struct page *page)
{
struct address_space *mapping = page->mapping;
int ret = 0;
@@ -2385,21 +2385,20 @@ int write_one_page(struct page *page, int wait)
BUG_ON(!PageLocked(page));
- if (wait)
- wait_on_page_writeback(page);
+ wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
- if (ret == 0 && wait) {
+ if (ret == 0)
wait_on_page_writeback(page);
- if (PageError(page))
- ret = -EIO;
- }
put_page(page);
} else {
unlock_page(page);
}
+
+ if (!ret)
+ ret = filemap_check_errors(mapping);
return ret;
}
EXPORT_SYMBOL(write_one_page);
@@ -2433,8 +2432,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- inc_memcg_page_state(page, NR_FILE_DIRTY);
- __inc_node_page_state(page, NR_FILE_DIRTY);
+ __inc_lruvec_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2455,8 +2453,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- dec_memcg_page_state(page, NR_FILE_DIRTY);
- dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE);
@@ -2712,8 +2709,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- dec_memcg_page_state(page, NR_FILE_DIRTY);
- dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
@@ -2759,8 +2755,7 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- dec_memcg_page_state(page, NR_WRITEBACK);
- dec_node_page_state(page, NR_WRITEBACK);
+ dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
@@ -2814,8 +2809,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- inc_memcg_page_state(page, NR_WRITEBACK);
- inc_node_page_state(page, NR_WRITEBACK);
+ inc_lruvec_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9e450c6b6e4..bd65b60939b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
-#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
-#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -292,6 +290,26 @@ int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
+ unsigned long max_initialise;
+ unsigned long reserved_lowmem;
+
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
+
+ /*
+ * Compensate for all the memblock reservations (e.g. crash kernel)
+ * from the initial estimation to make sure we will initialize enough
+ * memory to boot.
+ */
+ reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+ pgdat->node_start_pfn + max_initialise);
+ max_initialise += reserved_lowmem;
+
+ pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -314,20 +332,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
- unsigned long max_initialise;
-
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
- /*
- * Initialise at least 2G of a node but also take into account that
- * two large system hashes that can take up 1GB for 0.25TB/node.
- */
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
-
(*nr_initialised)++;
- if ((*nr_initialised > max_initialise) &&
+ if ((*nr_initialised > pgdat->static_init_size) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -500,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int bad_range(struct zone *zone, struct page *page)
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
@@ -510,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
#else
-static inline int bad_range(struct zone *zone, struct page *page)
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
@@ -1286,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
int nid;
@@ -1309,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
return true;
}
@@ -1354,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
- start_page = pfn_to_page(start_pfn);
+ start_page = pfn_to_online_page(start_pfn);
+ if (!start_page)
+ return NULL;
if (page_zone(start_page) != zone)
return NULL;
@@ -3662,6 +3675,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
}
+/*
+ * Decide whether a failing allocation should go back and retry because the
+ * cpuset state it was working from is stale.
+ *
+ * Returns true when the caller should retry. As a side effect ac->nodemask
+ * is cleared when it no longer intersects the cpuset's allowed memory nodes.
+ */
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+	/*
+	 * Common case: cpusets are off, there is no nodemask, or the nodemask
+	 * still intersects mems_allowed. The only thing left to check is the
+	 * race where mems_allowed or the mempolicy nodemask was being updated
+	 * by a parallel thread while we were allocating; if the cookie shows
+	 * the cpuset changed, retry instead of failing.
+	 */
+	if (!cpusets_enabled() || !ac->nodemask ||
+	    cpuset_nodemask_valid_mems_allowed(ac->nodemask))
+		return read_mems_allowed_retry(cpuset_mems_cookie);
+
+	/*
+	 * The mempolicy nodemask and the cpuset's mems_allowed do not
+	 * intersect. policy_nodemask() normally deals with this, but a cpuset
+	 * update can race so that its check passed and then became false
+	 * before we took our cpuset_mems_cookie. Dropping the nodemask assumes
+	 * that ac->nodemask only ever comes from an MPOL_BIND mempolicy (whose
+	 * documented semantics is to be ignored when it does not intersect
+	 * with the cpuset restrictions) or that the caller can cope with a
+	 * violated nodemask.
+	 */
+	ac->nodemask = NULL;
+	return true;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3857,11 +3903,9 @@ retry:
&compaction_retries))
goto retry;
- /*
- * It's possible we raced with cpuset update so the OOM would be
- * premature (see below the nopage: label for full explanation).
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+
+ /* Deal with possible cpuset update races before we start OOM killing */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
@@ -3870,7 +3914,9 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_thread_flag(TIF_MEMDIE) &&
+ (alloc_flags == ALLOC_NO_WATERMARKS ||
+ (gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
@@ -3880,14 +3926,8 @@ retry:
}
nopage:
- /*
- * When updating a task's mems_allowed or mempolicy nodemask, it is
- * possible to race with parallel threads in such a way that our
- * allocation can fail while the mask is being updated. If we are about
- * to fail, check if the cpuset changed during allocation and if so,
- * retry.
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+ /* Deal with possible cpuset update races before we fail */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
@@ -3938,12 +3978,12 @@ got_pg:
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask,
+ int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask);
- ac->zonelist = zonelist;
+ ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
@@ -3988,8 +4028,8 @@ static inline void finalise_ac(gfp_t gfp_mask,
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -3997,7 +4037,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
- if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
@@ -4601,8 +4641,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " slab_reclaimable:%lukB"
- " slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
@@ -4624,8 +4662,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
- K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
@@ -5111,6 +5147,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void setup_zone_pageset(struct zone *zone);
/*
@@ -5515,7 +5552,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-int __meminit init_currently_empty_zone(struct zone *zone,
+void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
@@ -5533,8 +5570,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
zone_init_free_lists(zone);
zone->initialized = 1;
-
- return 0;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5992,7 +6027,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6014,6 +6048,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
@@ -6074,8 +6110,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
- BUG_ON(ret);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -6136,7 +6171,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
- reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
pgdat->per_cpu_nodestats = NULL;
@@ -6158,6 +6192,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
+ reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
@@ -7169,6 +7204,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)
#endif
/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
@@ -7187,6 +7237,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
+ gfp_t gfp_flags;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7198,6 +7249,16 @@ void *__init alloc_large_system_hash(const char *tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
@@ -7231,12 +7292,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
+ /*
+ * memblock allocator returns zeroed memory already, so HASH_ZERO is
+ * currently not used when HASH_EARLY is specified.
+ */
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
- table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -7244,8 +7310,8 @@ void *__init alloc_large_system_hash(const char *tablename,
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, GFP_ATOMIC);
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7647,6 +7713,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
break;
if (pfn == end_pfn)
return;
+ offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
pfn = start_pfn;
diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..2da71e627812 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
/*
* We failed to write the page out to swap-space.
@@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5092e4ef00c8..3606104893e0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -138,12 +138,18 @@ static inline struct page *
__first_valid_page(unsigned long pfn, unsigned long nr_pages)
{
int i;
- for (i = 0; i < nr_pages; i++)
- if (pfn_valid_within(pfn + i))
- break;
- if (unlikely(i == nr_pages))
- return NULL;
- return pfn_to_page(pfn + i);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+
+ if (!pfn_valid_within(pfn + i))
+ continue;
+ page = pfn_to_online_page(pfn + i);
+ if (!page)
+ continue;
+ return page;
+ }
+ return NULL;
}
/*
@@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ struct page *page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index de9c40d7304a..8ec6ba230bb9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address);
+ pvmw->pte = huge_pte_offset(mm, pvmw->address,
+ PAGE_SIZE << compound_order(page));
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 60f7856e508f..1a4197965415 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
+ unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask);
+ pte = huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte && walk->hugetlb_entry)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
if (err)
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
new file mode 100644
index 000000000000..cd2442e13d8f
--- /dev/null
+++ b/mm/percpu-internal.h
@@ -0,0 +1,166 @@
+#ifndef _MM_PERCPU_INTERNAL_H
+#define _MM_PERCPU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/percpu.h>
+
+struct pcpu_chunk { /* state of one percpu chunk; fields described inline */
+#ifdef CONFIG_PERCPU_STATS
+ int nr_alloc; /* # of allocations; adjusted by pcpu_stats_area_{alloc,dealloc}() */
+ size_t max_alloc_size; /* largest allocation size */
+#endif /* CONFIG_PERCPU_STATS */
+
+ struct list_head list; /* linked to pcpu_slot lists */
+ int free_size; /* free bytes in the chunk */
+ int contig_hint; /* max contiguous size hint */
+ void *base_addr; /* base address of this chunk */
+
+ int map_used; /* # of map entries used before the sentry */
+ int map_alloc; /* # of map entries allocated */
+ int *map; /* allocation map */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+
+ void *data; /* chunk data */
+ int first_free; /* no free below this */
+ bool immutable; /* no [de]population allowed */
+ bool has_reserved; /* Indicates if chunk has reserved space
+ * at the beginning. Reserved chunk will
+ * contain reservation for static chunk.
+ * Dynamic chunk will contain reservation
+ * for static and reserved chunks. */
+ int nr_populated; /* # of populated pages */
+ unsigned long populated[]; /* populated bitmap (flexible array member) */
+};
+
+extern spinlock_t pcpu_lock;
+
+extern struct list_head *pcpu_slot;
+extern int pcpu_nr_slots;
+
+extern struct pcpu_chunk *pcpu_first_chunk;
+extern struct pcpu_chunk *pcpu_reserved_chunk;
+
+#ifdef CONFIG_PERCPU_STATS
+
+#include <linux/spinlock.h>
+
+struct percpu_stats { /* allocator-wide counters, kept in the global pcpu_stats */
+ u64 nr_alloc; /* lifetime # of allocations */
+ u64 nr_dealloc; /* lifetime # of deallocations */
+ u64 nr_cur_alloc; /* current # of allocations */
+ u64 nr_max_alloc; /* max # of live allocations */
+ u32 nr_chunks; /* current # of live chunks */
+ u32 nr_max_chunks; /* max # of live chunks */
+ size_t min_alloc_size; /* min allocation size; seeded with unit_size by pcpu_stats_save_ai() */
+ size_t max_alloc_size; /* max allocation size */
+};
+
+extern struct percpu_stats pcpu_stats;
+extern struct pcpu_alloc_info pcpu_stats_ai;
+
+/*
+ * Keep a copy of the allocation info for debugging. Only the fixed-size
+ * part of the structure is copied; the flexible array is not needed.
+ */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+	memcpy(&pcpu_stats_ai, ai, sizeof(pcpu_stats_ai));
+
+	/* min_alloc_size starts out at unit_size */
+	pcpu_stats.min_alloc_size = ai->unit_size;
+}
+
+/*
+ * pcpu_stats_area_alloc - account for a newly allocated area
+ * @chunk: chunk the area was allocated from
+ * @size: size of the allocated area in bytes
+ *
+ * Updates the global counters and size extremes in pcpu_stats as well as the
+ * per-chunk allocation count and largest-allocation size.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	/* allocator-wide counters */
+	pcpu_stats.nr_alloc++;
+	if (++pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
+		pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;
+
+	/* allocator-wide size extremes */
+	if (size < pcpu_stats.min_alloc_size)
+		pcpu_stats.min_alloc_size = size;
+	if (size > pcpu_stats.max_alloc_size)
+		pcpu_stats.max_alloc_size = size;
+
+	/* per-chunk bookkeeping */
+	chunk->nr_alloc++;
+	if (size > chunk->max_alloc_size)
+		chunk->max_alloc_size = size;
+}
+
+/*
+ * pcpu_stats_area_dealloc - account for an area being freed
+ * @chunk: chunk the area is being released from
+ *
+ * Bumps the lifetime deallocation counter and drops the live allocation
+ * counts, both allocator-wide and for @chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	chunk->nr_alloc--;
+
+	pcpu_stats.nr_cur_alloc--;
+	pcpu_stats.nr_dealloc++;
+}
+
+/*
+ * pcpu_stats_chunk_alloc - count a newly created chunk
+ *
+ * Takes pcpu_lock itself (irqsave), increments the live chunk count and
+ * raises the recorded maximum when the new count exceeds it.
+ */
+static inline void pcpu_stats_chunk_alloc(void)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&pcpu_lock, irq_flags);
+	if (++pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
+		pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;
+	spin_unlock_irqrestore(&pcpu_lock, irq_flags);
+}
+
+/*
+ * pcpu_stats_chunk_dealloc - count a chunk going away
+ *
+ * Takes pcpu_lock itself (irqsave) and decrements the live chunk count.
+ */
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&pcpu_lock, irq_flags);
+	pcpu_stats.nr_chunks--;
+	spin_unlock_irqrestore(&pcpu_lock, irq_flags);
+}
+
+#else
+
+/* CONFIG_PERCPU_STATS is not set: every stats hook is an empty inline. */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { }
+
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk,
+					 size_t size) { }
+
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { }
+
+static inline void pcpu_stats_chunk_alloc(void) { }
+
+static inline void pcpu_stats_chunk_dealloc(void) { }
+
+#endif /* !CONFIG_PERCPU_STATS */
+
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9..eb58aa4c0997 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
pcpu_chunk_populated(chunk, 0, nr_pages);
spin_unlock_irq(&pcpu_lock);
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
return chunk;
}
@@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
- if (chunk && chunk->data)
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
__free_pages(chunk->data, order_base_2(nr_pages));
pcpu_free_chunk(chunk);
}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
new file mode 100644
index 000000000000..03524a56eeff
--- /dev/null
+++ b/mm/percpu-stats.c
@@ -0,0 +1,222 @@
+/*
+ * mm/percpu-stats.c
+ *
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennisz@fb.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Prints statistics about the percpu allocator and backing chunks.
+ */
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+#include "percpu-internal.h"
+
+#define P(X, Y) \
+ seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+
+struct percpu_stats pcpu_stats;
+struct pcpu_alloc_info pcpu_stats_ai;
+
+static int cmpint(const void *a, const void *b)
+{
+ return *(int *)a - *(int *)b; /* sort() comparator, ascending; assumes a - b fits in an int */
+}
+
+/*
+ * Finds the max # of map entries used by any chunk, reserved chunk included.
+ */
+static int find_max_map_used(void)
+{
+	struct pcpu_chunk *chunk;
+	int slot, max_map_used;
+
+	/* percpu_stats_show() dumps the reserved chunk apart from the slots */
+	max_map_used = pcpu_reserved_chunk ? pcpu_reserved_chunk->map_used : 0;
+	for (slot = 0; slot < pcpu_nr_slots; slot++)
+		list_for_each_entry(chunk, &pcpu_slot[slot], list)
+			max_map_used = max(max_map_used, chunk->map_used);
+	return max_map_used;
+}
+
+/*
+ * Prints one chunk's state to @m. Fragmentation is considered between the
+ * beginning of the chunk and the last allocation; @buffer is int scratch.
+ */
+static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
+			    void *buffer)
+{
+	int i, s_index, last_alloc, alloc_sign, as_len;
+	int *alloc_sizes, *p;
+	/* statistics */
+	int sum_frag = 0, max_frag = 0;
+	int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
+
+	alloc_sizes = buffer;
+	s_index = chunk->has_reserved ? 1 : 0;
+
+	/* find last allocation */
+	last_alloc = -1;
+	for (i = chunk->map_used - 1; i >= s_index; i--) {
+		if (chunk->map[i] & 1) {
+			last_alloc = i;
+			break;
+		}
+	}
+
+	/* if the chunk is not empty - ignoring reserve */
+	if (last_alloc >= s_index) {
+		as_len = last_alloc + 1 - s_index;
+
+		/*
+		 * Iterate through chunk map computing size info.
+		 * The first bit is overloaded to be a used flag.
+		 * negative = free space, positive = allocated
+		 */
+		for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
+			alloc_sign = (*p & 1) ? 1 : -1;
+			alloc_sizes[i] = alloc_sign *
+				((p[1] & ~1) - (p[0] & ~1));
+		}
+
+		sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+
+		/* Free fragments sort first (negative); bound i before reading *p. */
+		for (i = 0, p = alloc_sizes; i < as_len && *p < 0; i++, p++) {
+			sum_frag -= *p;
+			max_frag = max(max_frag, -1 * (*p));
+		}
+
+		cur_min_alloc = alloc_sizes[i];
+		cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
+		cur_max_alloc = alloc_sizes[as_len - 1];
+	}
+
+	P("nr_alloc", chunk->nr_alloc);
+	P("max_alloc_size", chunk->max_alloc_size);
+	P("free_size", chunk->free_size);
+	P("contig_hint", chunk->contig_hint);
+	P("sum_frag", sum_frag);
+	P("max_frag", max_frag);
+	P("cur_min_alloc", cur_min_alloc);
+	P("cur_med_alloc", cur_med_alloc);
+	P("cur_max_alloc", cur_max_alloc);
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_file show routine for the percpu_stats file: prints the saved
+ * allocation info, the global counters and per-chunk stats under pcpu_lock.
+ */
+static int percpu_stats_show(struct seq_file *m, void *v)
+{
+	struct pcpu_chunk *pos;
+	int idx, nr_ents;
+	void *scratch;
+
+	/*
+	 * The scratch buffer is allocated with the lock dropped, so retry
+	 * until it is still large enough once the lock has been retaken.
+	 */
+	for (;;) {
+		spin_lock_irq(&pcpu_lock);
+		nr_ents = find_max_map_used();
+		spin_unlock_irq(&pcpu_lock);
+
+		scratch = vmalloc(nr_ents * sizeof(pcpu_first_chunk->map[0]));
+		if (!scratch)
+			return -ENOMEM;
+
+		spin_lock_irq(&pcpu_lock);
+		if (nr_ents >= find_max_map_used())
+			break;
+		spin_unlock_irq(&pcpu_lock);
+		vfree(scratch);
+	}
+
+#define PL(X) \
+	seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+
+	seq_puts(m,
+		 "Percpu Memory Statistics\n"
+		 "Allocation Info:\n"
+		 "----------------------------------------\n");
+	PL(unit_size);
+	PL(static_size);
+	PL(reserved_size);
+	PL(dyn_size);
+	PL(atom_size);
+	PL(alloc_size);
+	seq_putc(m, '\n');
+
+#undef PL
+
+#define PU(X) \
+	seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+
+	seq_puts(m,
+		 "Global Stats:\n"
+		 "----------------------------------------\n");
+	PU(nr_alloc);
+	PU(nr_dealloc);
+	PU(nr_cur_alloc);
+	PU(nr_max_alloc);
+	PU(nr_chunks);
+	PU(nr_max_chunks);
+	PU(min_alloc_size);
+	PU(max_alloc_size);
+	seq_putc(m, '\n');
+
+#undef PU
+
+	seq_puts(m,
+		 "Per Chunk Stats:\n"
+		 "----------------------------------------\n");
+
+	if (pcpu_reserved_chunk) {
+		seq_puts(m, "Chunk: <- Reserved Chunk\n");
+		chunk_map_stats(m, pcpu_reserved_chunk, scratch);
+	}
+
+	for (idx = 0; idx < pcpu_nr_slots; idx++) {
+		list_for_each_entry(pos, &pcpu_slot[idx], list) {
+			/* only the header line differs for the first chunk */
+			if (pos == pcpu_first_chunk)
+				seq_puts(m, "Chunk: <- First Chunk\n");
+			else
+				seq_puts(m, "Chunk:\n");
+			chunk_map_stats(m, pos, scratch);
+		}
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+	vfree(scratch);
+	return 0;
+}
+
+static int percpu_stats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, percpu_stats_show, NULL); /* show callback gets no private data */
+}
+
+static const struct file_operations percpu_stats_fops = {
+ .open = percpu_stats_open, /* single_open() bound to percpu_stats_show */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release, /* counterpart of single_open() */
+};
+
+/* Create the read-only (0444) "percpu_stats" debugfs file at late_initcall. */
+static int __init init_percpu_stats_debugfs(void)
+{
+	debugfs_create_file("percpu_stats", 0444, NULL, NULL,
+			    &percpu_stats_fops);
+	return 0;
+}
+
+late_initcall(init_percpu_stats_debugfs);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1..15dab691ea70 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->data = vms;
chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
return chunk;
}
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
- if (chunk && chunk->data)
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
pcpu_free_chunk(chunk);
}
diff --git a/mm/percpu.c b/mm/percpu.c
index e0aa8ae7bde7..bd4130a69bbc 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,11 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
@@ -103,53 +108,35 @@
#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
#endif /* CONFIG_SMP */
-struct pcpu_chunk {
- struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
- void *base_addr; /* base address of this chunk */
-
- int map_used; /* # of map entries used before the sentry */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
-
- void *data; /* chunk data */
- int first_free; /* no free below this */
- bool immutable; /* no [de]population allowed */
- int nr_populated; /* # of populated pages */
- unsigned long populated[]; /* populated bitmap */
-};
-
-static int pcpu_unit_pages __read_mostly;
-static int pcpu_unit_size __read_mostly;
-static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
-static int pcpu_nr_slots __read_mostly;
-static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
-static unsigned int pcpu_low_unit_cpu __read_mostly;
-static unsigned int pcpu_high_unit_cpu __read_mostly;
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr __read_mostly;
+void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
-static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
-const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
+static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */
-static int pcpu_nr_groups __read_mostly;
-static const unsigned long *pcpu_group_offsets __read_mostly;
-static const size_t *pcpu_group_sizes __read_mostly;
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* ways and thus often doesn't live in the vmalloc area.
*/
-static struct pcpu_chunk *pcpu_first_chunk;
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
@@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk;
* area doesn't exist, the following variables contain NULL and 0
* respectively.
*/
-static struct pcpu_chunk *pcpu_reserved_chunk;
-static int pcpu_reserved_chunk_limit;
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+static int pcpu_reserved_chunk_limit __ro_after_init;
-static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
@@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
int to_free = 0;
int *p;
+ lockdep_assert_held(&pcpu_lock);
+ pcpu_stats_area_dealloc(chunk);
+
freeme |= 1; /* we are searching for <given offset, in use> pair */
i = 0;
@@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map[0] = 0;
chunk->map[1] = pcpu_unit_size | 1;
chunk->map_used = 1;
+ chunk->has_reserved = false;
INIT_LIST_HEAD(&chunk->list);
INIT_LIST_HEAD(&chunk->map_extend_list);
@@ -965,8 +956,10 @@ restart:
* tasks to create chunks simultaneously. Serialize and create iff
* there's still no empty chunk after grabbing the mutex.
*/
- if (is_atomic)
+ if (is_atomic) {
+ err = "atomic alloc failed, no space left";
goto fail;
+ }
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
@@ -984,6 +977,7 @@ restart:
goto restart;
area_found:
+ pcpu_stats_area_alloc(chunk, size);
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
@@ -1026,11 +1020,17 @@ area_found:
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
+
+ trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr);
+
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
+ trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
if (!is_atomic && warn_limit) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
size, align, is_atomic, err);
@@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr)
}
}
+ trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
@@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+ pcpu_stats_save_ai(ai);
+
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
@@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (schunk->free_size)
schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
schunk->map[schunk->map_used] |= 1;
+ schunk->has_reserved = true;
/* init dynamic chunk if necessary */
if (dyn_size) {
@@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
dchunk->map[1] = pcpu_reserved_chunk_limit;
dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
dchunk->map_used = 2;
+ dchunk->has_reserved = true;
}
/* link the first chunk in */
@@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_count_occupied_pages(pcpu_first_chunk, 1);
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(base_addr);
+
/* we're done */
pcpu_base_addr = base_addr;
return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..ced14f1af6dc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
@@ -1157,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(&page->_mapcount))
goto out;
}
- __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
- mod_memcg_page_state(page, NR_FILE_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
@@ -1193,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
/*
- * We use the irq-unsafe __{inc|mod}_zone_page_state because
+ * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
- mod_memcg_page_state(page, NR_FILE_MAPPED, -nr);
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1379,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
int nr = 1 << compound_order(page);
hugetlb_count_sub(nr, mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
}
- pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- set_pte_at(mm, address, pvmw.pte, pteval);
} else if (pte_unused(pteval)) {
/*
* The guest indicated that the page content is of no
diff --git a/mm/shmem.c b/mm/shmem.c
index e67d6ba4e98e..9418f5a9bc46 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -1290,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
SetPageUptodate(page);
}
- swap = get_swap_page();
+ swap = get_swap_page(page);
if (!swap.val)
goto redirty;
@@ -1326,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
mutex_unlock(&shmem_swaplist_mutex);
free_swap:
- swapcache_free(swap);
+ put_swap_page(page, swap);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)
@@ -1645,8 +1646,7 @@ repeat:
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(charge_mm,
- PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
@@ -1902,10 +1902,10 @@ unlock:
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
return ret;
}
@@ -2840,7 +2840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
- WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
@@ -3761,6 +3761,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
+ uuid_gen(&sb->s_uuid);
inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
diff --git a/mm/slab.c b/mm/slab.c
index 2a31ee3c5814..04dec48c3ed7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- add_zone_page_state(page_zone(page),
- NR_SLAB_RECLAIMABLE, nr_pages);
+ mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
else
- add_zone_page_state(page_zone(page),
- NR_SLAB_UNRECLAIMABLE, nr_pages);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
@@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- sub_zone_page_state(page_zone(page),
- NR_SLAB_RECLAIMABLE, nr_freed);
+ mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
else
- sub_zone_page_state(page_zone(page),
- NR_SLAB_UNRECLAIMABLE, nr_freed);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);
BUG_ON(!PageSlab(page));
__ClearPageSlabPfmemalloc(page);
@@ -2040,17 +2036,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
- if (size & (BYTES_PER_WORD - 1)) {
- size += (BYTES_PER_WORD - 1);
- size &= ~(BYTES_PER_WORD - 1);
- }
+ size = ALIGN(size, BYTES_PER_WORD);
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
- size += REDZONE_ALIGN - 1;
- size &= ~(REDZONE_ALIGN - 1);
+ size = ALIGN(size, REDZONE_ALIGN);
}
/* 3) caller mandated alignment */
diff --git a/mm/slab.h b/mm/slab.h
index 9cfcf099709c..6885e1192ec5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- int ret;
-
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
-
- ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
- if (ret)
- return ret;
-
- memcg_kmem_update_page_stat(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
- 1 << order);
- return 0;
+ return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
}
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
@@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
{
if (!memcg_kmem_enabled())
return;
-
- memcg_kmem_update_page_stat(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
- -(1 << order));
memcg_kmem_uncharge(page, order);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 01a0fe2eb332..904a83be82de 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
/*
* Merge control. If this is set then no merging of slab caches will occur.
- * (Could be removed. This was introduced to pacify the merge skeptics.)
*/
-static int slab_nomerge;
+static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
static int __init setup_slab_nomerge(char *str)
{
- slab_nomerge = 1;
+ slab_nomerge = true;
return 1;
}
diff --git a/mm/slub.c b/mm/slub.c
index 57e5156f02be..1d3f9835f4ea 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1615,7 +1615,7 @@ out:
if (!page)
return NULL;
- mod_zone_page_state(page_zone(page),
+ mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1 << oo_order(oo));
@@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
kmemcheck_free_shadow(page, compound_order(page));
- mod_zone_page_state(page_zone(page),
+ mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
@@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
stat(s, CPU_PARTIAL_NODE);
}
if (!kmem_cache_has_cpu_partial(s)
- || available > s->cpu_partial / 2)
+ || available > slub_cpu_partial(s) / 2)
break;
}
@@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist)
+ void *freelist, struct kmem_cache_cpu *c)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -2132,6 +2132,9 @@ redo:
discard_slab(s, page);
stat(s, FREE_SLAB);
}
+
+ c->page = NULL;
+ c->freelist = NULL;
}
/*
@@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c->page, c->freelist);
+ deactivate_slab(s, c->page, c->freelist, c);
c->tid = next_tid(c->tid);
- c->page = NULL;
- c->freelist = NULL;
}
/*
@@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info)
struct kmem_cache *s = info;
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- return c->page || c->partial;
+ return c->page || slub_percpu_partial(c);
}
static void flush_all(struct kmem_cache *s)
@@ -2521,9 +2522,7 @@ redo:
if (unlikely(!node_match(page, searchnode))) {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist);
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
}
@@ -2534,9 +2533,7 @@ redo:
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(page, gfpflags))) {
- deactivate_slab(s, page, c->freelist);
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
@@ -2568,11 +2565,10 @@ load_freelist:
new_slab:
- if (c->partial) {
- page = c->page = c->partial;
- c->partial = page->next;
+ if (slub_percpu_partial(c)) {
+ page = c->page = slub_percpu_partial(c);
+ slub_set_percpu_partial(c, page);
stat(s, CPU_PARTIAL_ALLOC);
- c->freelist = NULL;
goto redo;
}
@@ -2592,9 +2588,7 @@ new_slab:
!alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
- deactivate_slab(s, page, get_freepointer(s, freelist));
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, get_freepointer(s, freelist), c);
return freelist;
}
@@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
s->min_partial = min;
}
+static void set_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	unsigned int limit = 30;
+
+	/*
+	 * cpu_partial determines the maximum number of objects kept in the
+	 * per cpu partial lists of a processor.
+	 *
+	 * Per cpu partial lists mainly contain slabs that just have one
+	 * object freed. If they are used for allocation then they can be
+	 * filled up again with minimal effort. The slab will never hit the
+	 * per node partial lists and therefore no locking will be required.
+	 *
+	 * This setting also determines
+	 * A) The number of objects from per cpu partial slabs dumped to the
+	 * per node list when we reach the limit.
+	 * B) The number of objects in cpu partial slabs to extract from the
+	 * per node list when we run out of per cpu objects. We only fetch
+	 * 50% to keep some capacity around for frees.
+	 */
+	if (!kmem_cache_has_cpu_partial(s))
+		limit = 0;
+	else if (s->size >= PAGE_SIZE)
+		limit = 2;
+	else if (s->size >= 1024)
+		limit = 6;
+	else if (s->size >= 256)
+		limit = 13;
+	s->cpu_partial = limit;
+#endif
+}
+
/*
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
@@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
*/
set_min_partial(s, ilog2(s->size) / 2);
- /*
- * cpu_partial determined the maximum number of objects kept in the
- * per cpu partial lists of a processor.
- *
- * Per cpu partial lists mainly contain slabs that just have one
- * object freed. If they are used for allocation then they can be
- * filled up again with minimal effort. The slab will never hit the
- * per node partial lists and therefore no locking will be required.
- *
- * This setting also determines
- *
- * A) The number of objects from per cpu partial slabs dumped to the
- * per node list when we reach the limit.
- * B) The number of objects in cpu partial slabs to extract from the
- * per node list when we run out of per cpu objects. We only fetch
- * 50% to keep some capacity around for frees.
- */
- if (!kmem_cache_has_cpu_partial(s))
- s->cpu_partial = 0;
- else if (s->size >= PAGE_SIZE)
- s->cpu_partial = 2;
- else if (s->size >= 1024)
- s->cpu_partial = 6;
- else if (s->size >= 256)
- s->cpu_partial = 13;
- else
- s->cpu_partial = 30;
+ set_cpu_partial(s);
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
@@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
* Disable empty slabs caching. Used to avoid pinning offline
* memory cgroups by kmem pages that can be freed.
*/
- s->cpu_partial = 0;
+ slub_set_cpu_partial(s, 0);
s->min_partial = 0;
/*
@@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[node] += x;
- page = READ_ONCE(c->partial);
+ page = slub_percpu_partial_read_once(c);
if (page) {
node = page_to_nid(page);
if (flags & SO_TOTAL)
@@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->cpu_partial);
+ return sprintf(buf, "%u\n", slub_cpu_partial(s));
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
if (objects && !kmem_cache_has_cpu_partial(s))
return -EINVAL;
- s->cpu_partial = objects;
+ slub_set_cpu_partial(s, objects);
flush_all(s);
return length;
}
@@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
int len;
for_each_online_cpu(cpu) {
- struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
if (page) {
pages += page->pages;
@@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
- struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
if (page && len < PAGE_SIZE - 20)
len += sprintf(buf + len, " C%d=%d(%d)", cpu,
@@ -5512,6 +5517,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
char mbuf[64];
char *buf;
struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+ ssize_t len;
if (!attr || !attr->store || !attr->show)
continue;
@@ -5536,8 +5542,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(root_cache, buf);
- attr->store(s, buf, strlen(buf));
+ len = attr->show(root_cache, buf);
+ if (len > 0)
+ attr->store(s, buf, len);
}
if (buffer)
@@ -5623,6 +5630,28 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
+static void sysfs_slab_remove_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s =
+		container_of(work, struct kmem_cache, kobj_remove_work);
+
+	/*
+	 * For a memcg cache, this may be called during deactivation and
+	 * again on shutdown. Remove the sysfs entries only once. A cache is
+	 * never shut down before deactivation is complete, so no need to
+	 * worry about synchronization.
+	 */
+	if (s->kobj.state_in_sysfs) {
+#ifdef CONFIG_MEMCG
+		kset_unregister(s->memcg_kset);
+#endif
+		kobject_uevent(&s->kobj, KOBJ_REMOVE);
+		kobject_del(&s->kobj);
+	}
+	/* always drop the ref sysfs_slab_remove() took before scheduling us */
+	kobject_put(&s->kobj);
+}
+
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5630,6 +5659,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
+
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5693,20 +5724,8 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- return;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
+ kobject_get(&s->kobj);
+ schedule_work(&s->kobj_remove_work);
}
void sysfs_slab_release(struct kmem_cache *s)
diff --git a/mm/sparse.c b/mm/sparse.c
index 6903c8fc3085..7b4be3fd5cac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
}
+/*
+ * There are a number of times that we loop over NR_MEM_SECTIONS,
+ * looking for section_present() on each. But, when we have very
+ * large physical address spaces, NR_MEM_SECTIONS can also be
+ * very large which makes the loops quite long.
+ *
+ * Keeping track of this gives us an easy way to break out of
+ * those loops early.
+ */
+int __highest_present_section_nr;
+/* Flag @ms as present and raise __highest_present_section_nr if needed. */
+static void section_mark_present(struct mem_section *ms)
+{
+	int nr = __section_nr(ms);
+
+	ms->section_mem_map |= SECTION_MARKED_PRESENT;
+	if (__highest_present_section_nr < nr)
+		__highest_present_section_nr = nr;
+}
+
+/* Next present section after @section_nr, or -1 when there is none left. */
+static inline int next_present_section_nr(int section_nr)
+{
+	/* bounds are checked before the section is probed */
+	while (++section_nr < NR_MEM_SECTIONS &&
+	       section_nr <= __highest_present_section_nr) {
+		if (present_section_nr(section_nr))
+			return section_nr;
+	}
+	return -1;
+}
+/* Visit each present section number >= @start; @start is parenthesized. */
+#define for_each_present_section_nr(start, section_nr)			\
+	for (section_nr = next_present_section_nr((start) - 1);	\
+	     ((section_nr >= 0) && (section_nr < NR_MEM_SECTIONS) &&	\
+	      (section_nr <= __highest_present_section_nr));		\
+	     section_nr = next_present_section_nr(section_nr))
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
set_section_nid(section, nid);
ms = __nr_to_section(section);
- if (!ms->section_mem_map)
+ if (!ms->section_mem_map) {
ms->section_mem_map = sparse_encode_early_nid(nid) |
- SECTION_MARKED_PRESENT;
+ SECTION_IS_ONLINE;
+ section_mark_present(ms);
+ }
}
}
@@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
int nodeid_begin = 0;
unsigned long pnum_begin = 0;
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ for_each_present_section_nr(0, pnum) {
struct mem_section *ms;
- if (!present_section_nr(pnum))
- continue;
ms = __nr_to_section(pnum);
nodeid_begin = sparse_early_nid(ms);
pnum_begin = pnum;
break;
}
map_count = 1;
- for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ for_each_present_section_nr(pnum_begin + 1, pnum) {
struct mem_section *ms;
int nodeid;
- if (!present_section_nr(pnum))
- continue;
ms = __nr_to_section(pnum);
nodeid = sparse_early_nid(ms);
if (nodeid == nodeid_begin) {
@@ -561,10 +597,7 @@ void __init sparse_init(void)
(void *)map_map);
#endif
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- if (!present_section_nr(pnum))
- continue;
-
+ for_each_present_section_nr(0, pnum) {
usemap = usemap_map[pnum];
if (!usemap)
continue;
@@ -590,6 +623,48 @@ void __init sparse_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
+
+/*
+ * Mark all memory sections within the pfn range as online.
+ *
+ * Walks [start_pfn, end_pfn) in PAGES_PER_SECTION steps and sets
+ * SECTION_IS_ONLINE in the mem_section that each step falls into.
+ * Sections that fail valid_section_nr() trigger a warning and are skipped.
+ */
+void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * Use the section of the current pfn; looking up start_pfn
+		 * here would only ever touch the first section of the range.
+		 */
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+		struct mem_section *ms;
+
+		/* onlining code should never touch invalid ranges */
+		if (WARN_ON(!valid_section_nr(section_nr)))
+			continue;
+
+		ms = __nr_to_section(section_nr);
+		ms->section_mem_map |= SECTION_IS_ONLINE;
+	}
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Mark all memory sections within the pfn range as offline.
+ *
+ * Walks [start_pfn, end_pfn) in PAGES_PER_SECTION steps and clears
+ * SECTION_IS_ONLINE in the mem_section that each step falls into.
+ * Sections that fail valid_section_nr() trigger a warning and are skipped.
+ */
+void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * Use the section of the current pfn; looking up start_pfn
+		 * here would only ever touch the first section of the range.
+		 */
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+		struct mem_section *ms;
+
+		/*
+		 * TODO this needs some double checking. Offlining code makes
+		 * sure to check pfn_valid but those checks might be just bogus
+		 */
+		if (WARN_ON(!valid_section_nr(section_nr)))
+			continue;
+
+		ms = __nr_to_section(section_nr);
+		ms->section_mem_map &= ~SECTION_IS_ONLINE;
+	}
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
@@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
- struct pglist_data *pgdat = zone->zone_pgdat;
struct mem_section *ms;
struct page *memmap;
unsigned long *usemap;
@@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
- ms->section_mem_map |= SECTION_MARKED_PRESENT;
+ section_mark_present(ms);
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 98d08b4579fa..4f44dbd7f780 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+ count_memcg_page_event(page, PGLAZYFREE);
update_page_reclaim_stat(lruvec, 1, 0);
}
}
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index ac6318a064d3..fcd2740f4ed7 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type)
if (!page)
goto not_enough_page;
ctrl->map[idx] = page;
+
+ if (!(idx % SWAP_CLUSTER_MAX))
+ cond_resched();
}
return 0;
not_enough_page:
@@ -58,21 +61,27 @@ not_enough_page:
return -ENOMEM;
}
+/*
+ * Return the swap_cgroup record for @offset within @ctrl.  Records are
+ * stored SC_PER_PAGE to a page, so the page is chosen by the quotient and
+ * the slot inside it by the remainder.
+ */
+static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
+						pgoff_t offset)
+{
+	struct swap_cgroup *records;
+
+	records = page_address(ctrl->map[offset / SC_PER_PAGE]);
+	return &records[offset % SC_PER_PAGE];
+}
+
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
struct swap_cgroup_ctrl **ctrlp)
{
pgoff_t offset = swp_offset(ent);
struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
- struct swap_cgroup *sc;
ctrl = &swap_cgroup_ctrl[swp_type(ent)];
if (ctrlp)
*ctrlp = ctrl;
-
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
+ return __lookup_swap_cgroup(ctrl, offset);
}
/**
@@ -105,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
}
/**
- * swap_cgroup_record - record mem_cgroup for this swp_entry.
- * @ent: swap entry to be recorded into
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * @ent: the first swap entry to be recorded into
* @id: mem_cgroup to be recorded
+ * @nr_ents: number of swap entries to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
*/
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
+ unsigned int nr_ents)
{
struct swap_cgroup_ctrl *ctrl;
struct swap_cgroup *sc;
unsigned short old;
unsigned long flags;
+ pgoff_t offset = swp_offset(ent);
+ pgoff_t end = offset + nr_ents;
sc = lookup_swap_cgroup(ent, &ctrl);
spin_lock_irqsave(&ctrl->lock, flags);
old = sc->id;
- sc->id = id;
+ for (;;) {
+ VM_BUG_ON(sc->id != old);
+ sc->id = id;
+ offset++;
+ if (offset == end)
+ break;
+ if (offset % SC_PER_PAGE)
+ sc++;
+ else
+ sc = __lookup_swap_cgroup(ctrl, offset);
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
return old;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 58f6c78f1dad..90c1032a8ac3 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
+ cache->slots);
return cache->nr;
}
@@ -301,11 +302,19 @@ direct_free:
return 0;
}
-swp_entry_t get_swap_page(void)
+swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry, *pentry;
struct swap_slots_cache *cache;
+ entry.val = 0;
+
+ if (PageTransHuge(page)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ get_swap_pages(1, true, &entry);
+ return entry;
+ }
+
/*
* Preemption is allowed here, because we may sleep
* in refill_swap_slots_cache(). But it is safe, because
@@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void)
*/
cache = raw_cpu_ptr(&swp_slots);
- entry.val = 0;
if (check_cache_active()) {
mutex_lock(&cache->alloc_lock);
if (cache->slots) {
@@ -337,7 +345,7 @@ repeat:
return entry;
}
- get_swap_pages(1, &entry);
+ get_swap_pages(1, false, &entry);
return entry;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 539b8885e3d1..9c71b6b2562f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
+#include <linux/huge_mm.h>
#include <asm/pgtable.h>
@@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
+#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
static struct {
unsigned long add_total;
@@ -90,39 +92,46 @@ void show_swap_cache_info(void)
*/
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
- int error;
+ int error, i, nr = hpage_nr_pages(page);
struct address_space *address_space;
+ pgoff_t idx = swp_offset(entry);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- get_page(page);
+ page_ref_add(page, nr);
SetPageSwapCache(page);
- set_page_private(page, entry.val);
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
- error = radix_tree_insert(&address_space->page_tree,
- swp_offset(entry), page);
- if (likely(!error)) {
- address_space->nrpages++;
- __inc_node_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(add_total);
+ for (i = 0; i < nr; i++) {
+ set_page_private(page + i, entry.val + i);
+ error = radix_tree_insert(&address_space->page_tree,
+ idx + i, page + i);
+ if (unlikely(error))
+ break;
}
- spin_unlock_irq(&address_space->tree_lock);
-
- if (unlikely(error)) {
+ if (likely(!error)) {
+ address_space->nrpages += nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ ADD_CACHE_INFO(add_total, nr);
+ } else {
/*
* Only the context which have set SWAP_HAS_CACHE flag
* would call add_to_swap_cache().
* So add_to_swap_cache() doesn't returns -EEXIST.
*/
VM_BUG_ON(error == -EEXIST);
- set_page_private(page, 0UL);
+ set_page_private(page + i, 0UL);
+ while (i--) {
+ radix_tree_delete(&address_space->page_tree, idx + i);
+ set_page_private(page + i, 0UL);
+ }
ClearPageSwapCache(page);
- put_page(page);
+ page_ref_sub(page, nr);
}
+ spin_unlock_irq(&address_space->tree_lock);
return error;
}
@@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
- error = radix_tree_maybe_preload(gfp_mask);
+ error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
if (!error) {
error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
@@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
struct address_space *address_space;
+ int i, nr = hpage_nr_pages(page);
+ swp_entry_t entry;
+ pgoff_t idx;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
@@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
- radix_tree_delete(&address_space->page_tree, swp_offset(entry));
- set_page_private(page, 0);
+ idx = swp_offset(entry);
+ for (i = 0; i < nr; i++) {
+ radix_tree_delete(&address_space->page_tree, idx + i);
+ set_page_private(page + i, 0);
+ }
ClearPageSwapCache(page);
- address_space->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(del_total);
+ address_space->nrpages -= nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ ADD_CACHE_INFO(del_total, nr);
}
/**
@@ -170,7 +184,7 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page *page, struct list_head *list)
+int add_to_swap(struct page *page)
{
swp_entry_t entry;
int err;
@@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageUptodate(page), page);
- entry = get_swap_page();
+ entry = get_swap_page(page);
if (!entry.val)
return 0;
- if (mem_cgroup_try_charge_swap(page, entry)) {
- swapcache_free(entry);
- return 0;
- }
-
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page_to_list(page, list))) {
- swapcache_free(entry);
- return 0;
- }
+ if (mem_cgroup_try_charge_swap(page, entry))
+ goto fail;
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
@@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list)
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
-
- if (!err) {
- return 1;
- } else { /* -ENOMEM radix-tree allocation failure */
+ /* -ENOMEM radix-tree allocation failure */
+ if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry);
- return 0;
- }
+ goto fail;
+
+ return 1;
+
+fail:
+ put_swap_page(page, entry);
+ return 0;
}
/*
@@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&address_space->tree_lock);
- swapcache_free(entry);
- put_page(page);
+ put_swap_page(page, entry);
+ page_ref_sub(page, hpage_nr_pages(page));
}
/*
@@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), swp_offset(entry));
- if (page) {
+ if (page && likely(!PageTransCompound(page))) {
INC_CACHE_INFO(find_success);
if (TestClearPageReadahead(page))
atomic_inc(&swapin_readahead_hits);
@@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry);
+ put_swap_page(new_page, entry);
} while (err != -ENOMEM);
if (new_page)
@@ -506,7 +514,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
- if (offset != entry_offset)
+ if (offset != entry_offset && likely(!PageTransCompound(page)))
SetPageReadahead(page);
put_page(page);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f6cba1b6632..811d90e1c929 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,6 +37,7 @@
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
+#include <linux/sort.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+#else
#define SWAPFILE_CLUSTER 256
+#endif
#define LATENCY_LIMIT 256
static inline void cluster_set_flag(struct swap_cluster_info *info,
@@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
schedule_work(&si->discard_work);
}
+/*
+ * Flag cluster @idx of @si as free and append it to the tail of
+ * si->free_clusters.
+ */
+static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	struct swap_cluster_info *clusters = si->cluster_info;
+
+	cluster_set_flag(&clusters[idx], CLUSTER_FLAG_FREE);
+	cluster_list_add_tail(&si->free_clusters, clusters, idx);
+}
+
/*
* Doing discard actually. After a cluster discard is finished, the cluster
* will be added to free cluster list. caller should hold si->lock.
@@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
spin_lock(&si->lock);
ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
- cluster_set_flag(ci, CLUSTER_FLAG_FREE);
- unlock_cluster(ci);
- cluster_list_add_tail(&si->free_clusters, info, idx);
- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+ __free_cluster(si, idx);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
unlock_cluster(ci);
@@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
}
+/*
+ * Take cluster @idx off the head of si->free_clusters and reset its count
+ * and flags to zero.  @idx is expected to be the first free cluster.
+ */
+static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	struct swap_cluster_info *clusters = si->cluster_info;
+
+	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
+	cluster_list_del_first(&si->free_clusters, clusters);
+	cluster_set_count_flag(&clusters[idx], 0, 0);
+}
+
+/*
+ * Release cluster @idx of @si, whose usage count must already be zero.
+ */
+static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	VM_BUG_ON(cluster_count(si->cluster_info + idx) != 0);
+
+	/*
+	 * On a writable swap device with page discard enabled, queue the
+	 * cluster for discard rather than freeing it right away; it is
+	 * freed once the discard has been done.
+	 */
+	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+	    (SWP_WRITEOK | SWP_PAGE_DISCARD))
+		swap_cluster_schedule_discard(si, idx);
+	else
+		__free_cluster(si, idx);
+}
+
/*
* The cluster corresponding to page_nr will be used. The cluster will be
* removed from free cluster list and its usage counter will be increased.
@@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
if (!cluster_info)
return;
- if (cluster_is_free(&cluster_info[idx])) {
- VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
- cluster_list_del_first(&p->free_clusters, cluster_info);
- cluster_set_count_flag(&cluster_info[idx], 0, 0);
- }
+ if (cluster_is_free(&cluster_info[idx]))
+ alloc_cluster(p, idx);
VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
cluster_set_count(&cluster_info[idx],
@@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
cluster_set_count(&cluster_info[idx],
cluster_count(&cluster_info[idx]) - 1);
- if (cluster_count(&cluster_info[idx]) == 0) {
- /*
- * If the swap is discardable, prepare discard the cluster
- * instead of free it immediately. The cluster will be freed
- * after discard.
- */
- if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
- (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
- swap_cluster_schedule_discard(p, idx);
- return;
- }
-
- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
- }
+ if (cluster_count(&cluster_info[idx]) == 0)
+ free_cluster(p, idx);
}
/*
@@ -558,6 +580,60 @@ new_cluster:
return found_free;
}
+/*
+ * Update the bookkeeping of @si after @nr_entries swap entries starting at
+ * @offset have been allocated: move lowest_bit/highest_bit inwards when the
+ * range starts or ends exactly on them, add the entries to inuse_pages and,
+ * once every page is in use, remove @si from swap_avail_head.
+ */
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ /* NOTE(review): offset is unsigned long but end is unsigned int - confirm this cannot truncate */
+ unsigned int end = offset + nr_entries - 1;
+
+ if (offset == si->lowest_bit)
+ si->lowest_bit += nr_entries;
+ if (end == si->highest_bit)
+ si->highest_bit -= nr_entries;
+ si->inuse_pages += nr_entries;
+ if (si->inuse_pages == si->pages) {
+ /* Full: highest_bit == 0 is what swap_range_free() tests as "was full" */
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+}
+
+/*
+ * Update the bookkeeping of @si after @nr_entries swap entries starting at
+ * @offset have been freed: widen lowest_bit/highest_bit to cover the range,
+ * put @si back on swap_avail_head if it was full and is still writable,
+ * credit nr_swap_pages, reduce inuse_pages, and then invalidate each freed
+ * offset in frontswap and notify the block device where one is registered.
+ */
+static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ unsigned long end = offset + nr_entries - 1;
+ void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+
+ if (offset < si->lowest_bit)
+ si->lowest_bit = offset;
+ if (end > si->highest_bit) {
+ /* highest_bit is set to 0 by swap_range_alloc() when the device fills up */
+ bool was_full = !si->highest_bit;
+
+ si->highest_bit = end;
+ if (was_full && (si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&si->avail_list));
+ if (plist_node_empty(&si->avail_list))
+ plist_add(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ si->inuse_pages -= nr_entries;
+ /* Resolve the optional per-slot notifier once, outside the loop */
+ if (si->flags & SWP_BLKDEV)
+ swap_slot_free_notify =
+ si->bdev->bd_disk->fops->swap_slot_free_notify;
+ else
+ swap_slot_free_notify = NULL;
+ while (offset <= end) {
+ frontswap_invalidate_page(si->type, offset);
+ if (swap_slot_free_notify)
+ swap_slot_free_notify(si->bdev, offset);
+ offset++;
+ }
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -676,18 +752,7 @@ checks:
inc_cluster_info_page(si, si->cluster_info, offset);
unlock_cluster(ci);
- if (offset == si->lowest_bit)
- si->lowest_bit++;
- if (offset == si->highest_bit)
- si->highest_bit--;
- si->inuse_pages++;
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
+ swap_range_alloc(si, offset, 1);
si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
@@ -766,6 +831,52 @@ no_page:
return n_ret;
}
+#ifdef CONFIG_THP_SWAP
+/*
+ * Allocate one whole cluster of @si (SWAPFILE_CLUSTER entries) and store the
+ * swap entry of its first slot in *@slot.
+ *
+ * Returns 1 on success, 0 when si->free_clusters is empty.
+ */
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+ unsigned long idx;
+ struct swap_cluster_info *ci;
+ unsigned long offset, i;
+ unsigned char *map;
+
+ if (cluster_list_empty(&si->free_clusters))
+ return 0;
+
+ idx = cluster_list_first(&si->free_clusters);
+ offset = idx * SWAPFILE_CLUSTER;
+ ci = lock_cluster(si, offset);
+ alloc_cluster(si, idx);
+ /* The cluster is handed out as a unit, so its count goes straight to full */
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+
+ map = si->swap_map + offset;
+ /* Every slot in the cluster is marked SWAP_HAS_CACHE */
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] = SWAP_HAS_CACHE;
+ unlock_cluster(ci);
+ swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+ *slot = swp_entry(si->type, offset);
+
+ return 1;
+}
+
+/*
+ * Give back the whole cluster @idx of @si: zero its count and flags under
+ * the cluster lock, pass it to free_cluster(), then update the range
+ * bookkeeping for all SWAPFILE_CLUSTER entries via swap_range_free().
+ */
+static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ unsigned long offset = idx * SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ /* free_cluster() asserts a zero count, so clear it first */
+ cluster_set_count_flag(ci, 0, 0);
+ free_cluster(si, idx);
+ unlock_cluster(ci);
+ swap_range_free(si, offset, SWAPFILE_CLUSTER);
+}
+#else
+/*
+ * Stub used when CONFIG_THP_SWAP is not set: warns once and returns 0,
+ * i.e. no cluster was allocated.
+ */
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+ VM_WARN_ON_ONCE(1);
+ return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
{
@@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
{
+ unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
- avail_pgs = atomic_long_read(&nr_swap_pages);
+ /* Only single cluster request supported */
+ WARN_ON_ONCE(n_goal > 1 && cluster);
+
+ avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
if (avail_pgs <= 0)
goto noswap;
@@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
- atomic_long_sub(n_goal, &nr_swap_pages);
+ atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -823,10 +938,13 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries);
+ if (cluster)
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ else
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries);
spin_unlock(&si->lock);
- if (n_ret)
+ if (n_ret || cluster)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
@@ -852,7 +970,8 @@ nextsi:
check_out:
if (n_ret < n_goal)
- atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
+ atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+ &nr_swap_pages);
noswap:
return n_ret;
}
@@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
dec_cluster_info_page(p, p->cluster_info, offset);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry);
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit) {
- bool was_full = !p->highest_bit;
-
- p->highest_bit = offset;
- if (was_full && (p->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&p->avail_list));
- if (plist_node_empty(&p->avail_list))
- plist_add(&p->avail_list,
- &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
- }
- atomic_long_inc(&nr_swap_pages);
- p->inuse_pages--;
- frontswap_invalidate_page(p->type, offset);
- if (p->flags & SWP_BLKDEV) {
- struct gendisk *disk = p->bdev->bd_disk;
-
- if (disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev,
- offset);
- }
+ mem_cgroup_uncharge_swap(entry, 1);
+ swap_range_free(p, offset, 1);
}
/*
@@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void swapcache_free(swp_entry_t entry)
+static void swapcache_free(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry)
}
}
+#ifdef CONFIG_THP_SWAP
+/*
+ * Free the whole cluster that starts at @entry: clear the swap_map bytes of
+ * all SWAPFILE_CLUSTER slots (each must hold exactly SWAP_HAS_CACHE),
+ * uncharge them from the memory cgroup and release the cluster.
+ * Does nothing when swap_info_get() returns NULL for @entry.
+ */
+static void swapcache_free_cluster(swp_entry_t entry)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned long idx = offset / SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned char *map;
+ unsigned int i;
+
+ /* NOTE(review): presumably returns with si->lock held, since it is unlocked below - confirm */
+ si = swap_info_get(entry);
+ if (!si)
+ return;
+
+ ci = lock_cluster(si, offset);
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
+ map[i] = 0;
+ }
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+}
+#else
+/* Stub used when CONFIG_THP_SWAP is not set: does nothing. */
+static inline void swapcache_free_cluster(swp_entry_t entry)
+{
+}
+#endif /* CONFIG_THP_SWAP */
+
+/*
+ * Release the swap space at @entry that was allocated for @page: a whole
+ * cluster when @page is a transparent huge page, a single entry otherwise.
+ */
+void put_swap_page(struct page *page, swp_entry_t entry)
+{
+	if (PageTransHuge(page)) {
+		swapcache_free_cluster(entry);
+		return;
+	}
+
+	swapcache_free(entry);
+}
+
+/*
+ * Comparison callback ordering swap entries by swap type: negative, zero
+ * or positive as the type of @ent1 is below, equal to or above that of
+ * @ent2.
+ */
+static int swp_entry_cmp(const void *ent1, const void *ent2)
+{
+	int type1 = (int)swp_type(*(const swp_entry_t *)ent1);
+	int type2 = (int)swp_type(*(const swp_entry_t *)ent2);
+
+	return type1 - type2;
+}
+
void swapcache_free_entries(swp_entry_t *entries, int n)
{
struct swap_info_struct *p, *prev;
@@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
prev = NULL;
p = NULL;
+
+ /*
+ * Sort swap entries by swap device, so each lock is only taken once.
+ * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
+ * so low that it isn't necessary to optimize further.
+ */
+ if (nr_swapfiles > 1)
+ sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
for (i = 0; i < n; ++i) {
p = swap_info_get_cont(entries[i], prev);
if (p)
diff --git a/mm/util.c b/mm/util.c
index 464df3489903..26be6407abd7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -357,8 +357,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
/*
- * Make sure that larger requests are not too disruptive - no OOM
- * killer and no allocation failure warnings as we have a fallback
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+ * contribute to a long term fragmentation less than vmalloc fallback.
+ * However make sure that larger requests are not too disruptive - no
+ * OOM killer and no allocation failure warnings as we have a fallback.
*/
if (size > PAGE_SIZE) {
kmalloc_flags |= __GFP_NOWARN;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 34a1c3e46ed7..6211a807cb31 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (p4d_none(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+
+ /*
+ * Don't dereference bad PUD or PMD (below) entries. This will also
+ * identify huge mappings, which we may encounter on architectures
+ * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
+ * identified as vmalloc addresses by is_vmalloc_addr(), but are
+ * not [unambiguously] associated with a struct page, so there is
+ * no correct value to return for them.
+ */
+ WARN_ON_ONCE(pud_bad(*pud));
+ if (pud_none(*pud) || pud_bad(*pud))
return NULL;
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ WARN_ON_ONCE(pmd_bad(*pmd));
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -1759,12 +1770,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
clear_vm_uninitialized_flag(area);
- /*
- * A ref_count = 2 is needed because vm_struct allocated in
- * __get_vm_area_node() contains a reference to the virtual address of
- * the vmalloc'ed block.
- */
- kmemleak_alloc(addr, real_size, 2, gfp_mask);
+ kmemleak_vmalloc(area, size, gfp_mask);
return addr;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 6063581f705c..ce0618bfa8d0 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
unsigned long pressure = 0;
/*
- * reclaimed can be greater than scanned in cases
- * like THP, where the scanned is 1 and reclaimed
- * could be 512
+ * reclaimed can be greater than scanned for things such as reclaimed
+ * slab pages. shrink_node() just adds reclaimed pages without a
+ * related increment to scanned pages.
*/
if (reclaimed >= scanned)
goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..9e95fafc026b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- swapcache_free(swap);
+ put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
@@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list,
!PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (!add_to_swap(page, page_list))
+ if (PageTransHuge(page)) {
+ /* cannot split THP, skip it */
+ if (!can_split_huge_page(page, NULL))
+ goto activate_locked;
+ /*
+ * Split pages without a PMD map right
+ * away. Chances are some or all of the
+ * tail pages can be freed without IO.
+ */
+ if (!compound_mapcount(page) &&
+ split_huge_page_to_list(page, page_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(page)) {
+ if (!PageTransHuge(page))
+ goto activate_locked;
+ /* Split THP and swap individual base pages */
+ if (split_huge_page_to_list(page, page_list))
+ goto activate_locked;
+ if (!add_to_swap(page))
+ goto activate_locked;
+ }
+
+ /* XXX: We don't support THP writes */
+ if (PageTransHuge(page) &&
+ split_huge_page_to_list(page, page_list)) {
+ delete_from_swap_cache(page);
goto activate_locked;
+ }
+
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
count_vm_event(PGLAZYFREED);
+ count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
/*
@@ -1295,6 +1324,7 @@ activate_locked:
if (!PageMlocked(page)) {
SetPageActive(page);
pgactivate++;
+ count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
unlock_page(page);
@@ -1734,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (global_reclaim(sc)) {
- if (current_is_kswapd())
+ if (current_is_kswapd()) {
+ if (global_reclaim(sc))
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- else
+ count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
+ nr_scanned);
+ } else {
+ if (global_reclaim(sc))
__count_vm_events(PGSCAN_DIRECT, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
+ nr_scanned);
}
spin_unlock_irq(&pgdat->lru_lock);
@@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
- if (global_reclaim(sc)) {
- if (current_is_kswapd())
+ if (current_is_kswapd()) {
+ if (global_reclaim(sc))
__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- else
+ count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
+ nr_reclaimed);
+ } else {
+ if (global_reclaim(sc))
__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
+ nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
@@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
}
}
- if (!is_active_lru(lru))
+ if (!is_active_lru(lru)) {
__count_vm_events(PGDEACTIVATE, nr_moved);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_moved);
+ }
return nr_moved;
}
@@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2967,7 +3011,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
+ .gfp_mask = current_gfp_context(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -2982,12 +3026,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
- if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
- gfp_mask,
+ sc.gfp_mask,
sc.reclaim_idx);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3652,7 +3696,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
@@ -3774,17 +3818,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int classzone_idx = gfp_zone(gfp_mask);
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
+ .gfp_mask = current_gfp_context(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = classzone_idx,
+ .reclaim_idx = gfp_zone(gfp_mask),
};
cond_resched();
@@ -3795,7 +3838,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- lockdep_set_current_reclaim_state(gfp_mask);
+ lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3831,7 +3874,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 76f73670200a..744ceaeb42a0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -928,8 +928,6 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
"nr_bounce",
@@ -952,6 +950,8 @@ const char * const vmstat_text[] = {
"nr_inactive_file",
"nr_active_file",
"nr_unevictable",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
"nr_isolated_anon",
"nr_isolated_file",
"workingset_refault",
@@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = {
"drop_pagecache",
"drop_slab",
+ "oom_kill",
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
@@ -1223,11 +1224,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
struct page *page;
- if (!pfn_valid(pfn))
+ page = pfn_to_online_page(pfn);
+ if (!page)
continue;
- page = pfn_to_page(pfn);
-
/* Watch for unexpected holes punched in the memmap */
if (!memmap_valid_within(pfn, page, zone))
continue;
@@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file)
return seq_open(file, &fragmentation_op);
}
-static const struct file_operations fragmentation_file_operations = {
+static const struct file_operations buddyinfo_file_operations = {
.open = fragmentation_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &pagetypeinfo_op);
}
-static const struct file_operations pagetypeinfo_file_ops = {
+static const struct file_operations pagetypeinfo_file_operations = {
.open = pagetypeinfo_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &zoneinfo_op);
}
-static const struct file_operations proc_zoneinfo_file_operations = {
+static const struct file_operations zoneinfo_file_operations = {
.open = zoneinfo_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file)
return seq_open(file, &vmstat_op);
}
-static const struct file_operations proc_vmstat_file_operations = {
+static const struct file_operations vmstat_file_operations = {
.open = vmstat_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1785,10 +1785,10 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
- proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
- proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
- proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
- proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
+ proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+ proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
+ proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
+ proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
#endif
}
diff --git a/mm/workingset.c b/mm/workingset.c
index b8c9ab678479..7119cd745ace 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -288,12 +288,10 @@ bool workingset_refault(void *shadow)
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
- inc_node_state(pgdat, WORKINGSET_REFAULT);
- inc_memcg_state(memcg, WORKINGSET_REFAULT);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
if (refault_distance <= active_file) {
- inc_node_state(pgdat, WORKINGSET_ACTIVATE);
- inc_memcg_state(memcg, WORKINGSET_ACTIVATE);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
rcu_read_unlock();
return true;
}
@@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
- inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node,
workingset_update_node, mapping);
diff --git a/mm/zswap.c b/mm/zswap.c
index eedc27894b10..d39581a076c3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu)
u8 *dst;
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!dst) {
- pr_err("can't allocate compressor buffer\n");
+ if (!dst)
return -ENOMEM;
- }
+
per_cpu(zswap_dstmem, cpu) = dst;
return 0;
}
@@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
}
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool) {
- pr_err("pool alloc failed\n");
+ if (!pool)
return NULL;
- }
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
@@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type)
{
struct zswap_tree *tree;
- tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
+ tree = kzalloc(sizeof(*tree), GFP_KERNEL);
if (!tree) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
return;