summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-04-02 13:55:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-04-02 13:55:34 -0700
commit6cad420cc695867b4ca710bac21fde21a4102e4b (patch)
tree890d42abc1e82c2cf5cef583584f88ca70116ce9 /mm
parent7be97138e7276c71cc9ad1752dcb502d28f4400d (diff)
parent77d6b9094819ba55353de0ef92957f3f54f2c36c (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: "A large amount of MM, plenty more to come. Subsystems affected by this patch series: - tools - kthread - kbuild - scripts - ocfs2 - vfs - mm: slub, kmemleak, pagecache, gup, swap, memcg, pagemap, mremap, sparsemem, kasan, pagealloc, vmscan, compaction, mempolicy, hugetlbfs, hugetlb" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (155 commits) include/linux/huge_mm.h: check PageTail in hpage_nr_pages even when !THP mm/hugetlb: fix build failure with HUGETLB_PAGE but not HUGEBTLBFS selftests/vm: fix map_hugetlb length used for testing read and write mm/hugetlb: remove unnecessary memory fetch in PageHeadHuge() mm/hugetlb.c: clean code by removing unnecessary initialization hugetlb_cgroup: add hugetlb_cgroup reservation docs hugetlb_cgroup: add hugetlb_cgroup reservation tests hugetlb: support file_region coalescing again hugetlb_cgroup: support noreserve mappings hugetlb_cgroup: add accounting for shared mappings hugetlb: disable region_add file_region coalescing hugetlb_cgroup: add reservation accounting for private mappings mm/hugetlb_cgroup: fix hugetlb_cgroup migration hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations hugetlb_cgroup: add hugetlb_cgroup reservation counter hugetlbfs: Use i_mmap_rwsem to address page fault/truncate race hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization mm/memblock.c: remove redundant assignment to variable max_addr mm: mempolicy: require at least one nodeid for MPOL_PREFERRED mm: mempolicy: use VM_BUG_ON_VMA in queue_pages_test_walk() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile1
-rw-r--r--mm/compaction.c31
-rw-r--r--mm/debug.c44
-rw-r--r--mm/filemap.c71
-rw-r--r--mm/gup.c658
-rw-r--r--mm/gup_benchmark.c71
-rw-r--r--mm/huge_memory.c29
-rw-r--r--mm/hugetlb.c802
-rw-r--r--mm/hugetlb_cgroup.c317
-rw-r--r--mm/internal.h32
-rw-r--r--mm/kasan/common.c26
-rw-r--r--mm/kasan/generic.c9
-rw-r--r--mm/kasan/generic_report.c11
-rw-r--r--mm/kasan/kasan.h2
-rw-r--r--mm/kasan/report.c5
-rw-r--r--mm/kasan/tags.c9
-rw-r--r--mm/kasan/tags_report.c11
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/list_lru.c12
-rw-r--r--mm/mapping_dirty_helpers.c42
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c342
-rw-r--r--mm/memory-failure.c29
-rw-r--r--mm/memory.c4
-rw-r--r--mm/mempolicy.c73
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mmap.c28
-rw-r--r--mm/mremap.c92
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c82
-rw-r--r--mm/page_counter.c23
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/rmap.c39
-rw-r--r--mm/shuffle.c2
-rw-r--r--mm/slab.h22
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/slub.c27
-rw-r--r--mm/sparse.c29
-rw-r--r--mm/swap.c5
-rw-r--r--mm/swap_slots.c12
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c10
-rw-r--r--mm/userfaultfd.c11
-rw-r--r--mm/vmpressure.c8
-rw-r--r--mm/vmscan.c111
-rw-r--r--mm/vmstat.c2
47 files changed, 2275 insertions, 917 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 272e66039e70..dbc8346d16ca 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -6,6 +6,7 @@
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
+KCSAN_SANITIZE_kmemleak.o := n
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
diff --git a/mm/compaction.c b/mm/compaction.c
index 672d3c78c6ab..df3da2f76fdc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -894,12 +894,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Regardless of being on LRU, compound pages such as THP and
- * hugetlbfs are not to be compacted. We can potentially save
- * a lot of iterations if we skip them at once. The check is
- * racy, but we can consider only valid values and the only
- * danger is skipping too much.
+ * hugetlbfs are not to be compacted unless we are attempting
+ * an allocation much larger than the huge page size (eg CMA).
+ * We can potentially save a lot of iterations if we skip them
+ * at once. The check is racy, but we can consider only valid
+ * values and the only danger is skipping too much.
*/
- if (PageCompound(page)) {
+ if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
if (likely(order < MAX_ORDER))
@@ -969,7 +970,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* and it's on LRU. It can only be a THP so the order
* is safe to read and it's 0 for tail pages.
*/
- if (unlikely(PageCompound(page))) {
+ if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
@@ -981,12 +982,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (__isolate_lru_page(page, isolate_mode) != 0)
goto isolate_fail;
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ /* The whole page is taken off the LRU; skip the tail pages. */
+ if (PageCompound(page))
+ low_pfn += compound_nr(page) - 1;
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_cache(page),
+ hpage_nr_pages(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
@@ -1590,7 +1594,11 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
+#ifdef CONFIG_PREEMPT_RT
+int sysctl_compact_unevictable_allowed __read_mostly = 0;
+#else
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+#endif
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -2174,7 +2182,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
if (update_cached) {
@@ -2310,8 +2317,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.page = NULL,
};
- if (capture)
- current->capture_control = &capc;
+ current->capture_control = &capc;
ret = compact_zone(&cc, &capc);
@@ -2333,6 +2339,7 @@ int sysctl_extfrag_threshold = 500;
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
*
* This is the main entry point for direct page compaction.
*/
diff --git a/mm/debug.c b/mm/debug.c
index ecccd9f17801..2189357f0987 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,8 +44,10 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
+ struct page *head = compound_head(page);
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
+ bool compound = PageCompound(page);
/*
* Accessing the pageblock without the zone lock. It could change to
* "isolate" again in the meantime, but since we are just dumping the
@@ -66,25 +68,43 @@ void __dump_page(struct page *page, const char *reason)
goto hex_only;
}
- mapping = page_mapping(page);
+ if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
+ /* Corrupt page, cannot call page_mapping */
+ mapping = page->mapping;
+ head = page;
+ compound = false;
+ } else {
+ mapping = page_mapping(page);
+ }
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
* encode own info.
*/
- mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+ mapcount = PageSlab(head) ? 0 : page_mapcount(page);
- if (PageCompound(page))
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%px "
- "index:%#lx compound_mapcount: %d\n",
- page, page_ref_count(page), mapcount,
- page->mapping, page_to_pgoff(page),
- compound_mapcount(page));
+ if (compound)
+ if (hpage_pincount_available(page)) {
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
+ "index:%#lx head:%px order:%u "
+ "compound_mapcount:%d compound_pincount:%d\n",
+ page, page_ref_count(head), mapcount,
+ mapping, page_to_pgoff(page), head,
+ compound_order(head), compound_mapcount(page),
+ compound_pincount(page));
+ } else {
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
+ "index:%#lx head:%px order:%u "
+ "compound_mapcount:%d\n",
+ page, page_ref_count(head), mapcount,
+ mapping, page_to_pgoff(page), head,
+ compound_order(head), compound_mapcount(page));
+ }
else
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n",
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n",
page, page_ref_count(page), mapcount,
- page->mapping, page_to_pgoff(page));
+ mapping, page_to_pgoff(page));
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
@@ -106,6 +126,10 @@ hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
+ if (head != page)
+ print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
+ sizeof(unsigned long), head,
+ sizeof(struct page), false);
if (reason)
pr_warn("page dumped because: %s\n", reason);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1784478270e1..0fbdc8e30dd2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1386,7 +1386,7 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault_flag_allow_retry_first(flags)) {
/*
* CAUTION! In this case, mmap_sem is not released
* even though return 0.
@@ -1536,7 +1536,6 @@ out:
return page;
}
-EXPORT_SYMBOL(find_get_entry);
/**
* find_lock_entry - locate, pin and lock a page cache entry
@@ -1575,42 +1574,39 @@ repeat:
EXPORT_SYMBOL(find_lock_entry);
/**
- * pagecache_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- * @fgp_flags: PCG flags
- * @gfp_mask: gfp mask to use for the page cache data page allocation
- *
- * Looks up the page cache slot at @mapping & @offset.
+ * pagecache_get_page - Find and get a reference to a page.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ * @fgp_flags: %FGP flags modify how the page is returned.
+ * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
*
- * PCG flags modify how the page is returned.
+ * Looks up the page cache entry at @mapping & @index.
*
- * @fgp_flags can be:
+ * @fgp_flags can be zero or more of these flags:
*
- * - FGP_ACCESSED: the page will be marked accessed
- * - FGP_LOCK: Page is return locked
- * - FGP_CREAT: If page is not present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU
- * list. The page is returned locked and with an increased
- * refcount.
- * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
- * its own locking dance if the page is already in cache, or unlock the page
- * before returning if we had to add the page to pagecache.
+ * * %FGP_ACCESSED - The page will be marked accessed.
+ * * %FGP_LOCK - The page is returned locked.
+ * * %FGP_CREAT - If no page is present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU list.
+ * The page is returned locked and with an increased refcount.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ * page is already in cache. If the page was allocated, unlock it before
+ * returning so the caller can do the same dance.
*
- * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
- * if the GFP flags specified for FGP_CREAT are atomic.
+ * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
+ * if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
*
- * Return: the found page or %NULL otherwise.
+ * Return: The found page or %NULL otherwise.
*/
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
- int fgp_flags, gfp_t gfp_mask)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp_mask)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (xa_is_value(page))
page = NULL;
if (!page)
@@ -1632,7 +1628,7 @@ repeat:
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page->index != index, page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1657,7 +1653,7 @@ no_page:
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
- err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
+ err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
if (unlikely(err)) {
put_page(page);
page = NULL;
@@ -1962,8 +1958,7 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
*
* It is going insane. Fix it by quickly scaling down the readahead size.
*/
-static void shrink_readahead_size_eio(struct file *filp,
- struct file_ra_state *ra)
+static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
ra->ra_pages /= 4;
}
@@ -2188,7 +2183,7 @@ readpage:
goto find_page;
}
unlock_page(page);
- shrink_readahead_size_eio(filp, ra);
+ shrink_readahead_size_eio(ra);
error = -EIO;
goto readpage_error;
}
@@ -2416,7 +2411,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vmf->vma->vm_flags & VM_RAND_READ)
+ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
@@ -2491,7 +2486,7 @@ retry_find:
if (!page) {
if (fpin)
goto out_retry;
- return vmf_error(-ENOMEM);
+ return VM_FAULT_OOM;
}
}
@@ -2560,7 +2555,7 @@ page_not_uptodate:
goto retry_find;
/* Things didn't work out. Return zero to tell the mm layer so. */
- shrink_readahead_size_eio(file, ra);
+ shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
@@ -2823,6 +2818,14 @@ filler:
unlock_page(page);
goto out;
}
+
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures.
+ * Clear page error before actual read, PG_error will be
+ * set again if read page fails.
+ */
+ ClearPageError(page);
goto filler;
out:
diff --git a/mm/gup.c b/mm/gup.c
index 1b521e0ac1de..da3e03185144 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,22 @@ struct follow_page_context {
unsigned int page_mask;
};
+static void hpage_pincount_add(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+ atomic_add(refs, compound_pincount_ptr(page));
+}
+
+static void hpage_pincount_sub(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+ atomic_sub(refs, compound_pincount_ptr(page));
+}
+
/*
* Return the compound head page with ref appropriately incremented,
* or NULL if that failed.
@@ -44,6 +60,195 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
return head;
}
+/*
+ * try_grab_compound_head() - attempt to elevate a page's refcount, by a
+ * flags-dependent amount.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: page's refcount will be incremented by 1.
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: head page (with refcount appropriately incremented) for success, or
+ * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
+ * considered failure, and furthermore, a likely bug in the caller, so a warning
+ * is also emitted.
+ */
+static __maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs,
+ unsigned int flags)
+{
+ if (flags & FOLL_GET)
+ return try_get_compound_head(page, refs);
+ else if (flags & FOLL_PIN) {
+ int orig_refs = refs;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
+ * path, so fail and let the caller fall back to the slow path.
+ */
+ if (unlikely(flags & FOLL_LONGTERM) &&
+ is_migrate_cma_page(page))
+ return NULL;
+
+ /*
+ * When pinning a compound page of order > 1 (which is what
+ * hpage_pincount_available() checks for), use an exact count to
+ * track it, via hpage_pincount_add/_sub().
+ *
+ * However, be sure to *also* increment the normal page refcount
+ * field at least once, so that the page really is pinned.
+ */
+ if (!hpage_pincount_available(page))
+ refs *= GUP_PIN_COUNTING_BIAS;
+
+ page = try_get_compound_head(page, refs);
+ if (!page)
+ return NULL;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
+ orig_refs);
+
+ return page;
+ }
+
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+/**
+ * try_grab_page() - elevate a page's refcount by a flag-dependent amount
+ *
+ * This might not do anything at all, depending on the flags argument.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * @page: pointer to page to be grabbed
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
+ * time. Cases:
+ *
+ * FOLL_GET: page's refcount will be incremented by 1.
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: true for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
+ * FOLL_PIN was set, but the page could not be grabbed.
+ */
+bool __must_check try_grab_page(struct page *page, unsigned int flags)
+{
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+
+ if (flags & FOLL_GET)
+ return try_get_page(page);
+ else if (flags & FOLL_PIN) {
+ int refs = 1;
+
+ page = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+ return false;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ /*
+ * Similar to try_grab_compound_head(): even if using the
+ * hpage_pincount_add/_sub() routines, be sure to
+ * *also* increment the normal page refcount field at least
+ * once, so that the page really is pinned.
+ */
+ page_ref_add(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+ }
+
+ return true;
+}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+static bool __unpin_devmap_managed_user_page(struct page *page)
+{
+ int count, refs = 1;
+
+ if (!page_is_devmap_managed(page))
+ return false;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_sub(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ count = page_ref_sub_return(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
+ /*
+ * devmap page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (count == 1)
+ free_devmap_managed_page(page);
+ else if (!count)
+ __put_page(page);
+
+ return true;
+}
+#else
+static bool __unpin_devmap_managed_user_page(struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+/**
+ * unpin_user_page() - release a dma-pinned page
+ * @page: pointer to page to be released
+ *
+ * Pages that were pinned via pin_user_pages*() must be released via either
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
+ * that such pages can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special handling.
+ */
+void unpin_user_page(struct page *page)
+{
+ int refs = 1;
+
+ page = compound_head(page);
+
+ /*
+ * For devmap managed pages we need to catch refcount transition from
+ * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
+ * page is free and we need to inform the device driver through
+ * callback. See include/linux/memremap.h and HMM for details.
+ */
+ if (__unpin_devmap_managed_user_page(page))
+ return;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_sub(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ if (page_ref_sub_and_test(page, refs))
+ __put_page(page);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
+}
+EXPORT_SYMBOL(unpin_user_page);
+
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
@@ -193,6 +398,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
+ int ret;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
@@ -230,10 +436,11 @@ retry:
}
page = vm_normal_page(vma, address, pte);
- if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+ if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
- * Only return device mapping pages in the FOLL_GET case since
- * they are only valid while holding the pgmap reference.
+ * Only return device mapping pages in the FOLL_GET or FOLL_PIN
+ * case since they are only valid while holding the pgmap
+ * reference.
*/
*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
if (*pgmap)
@@ -250,8 +457,6 @@ retry:
if (is_zero_pfn(pte_pfn(pte))) {
page = pte_page(pte);
} else {
- int ret;
-
ret = follow_pfn_pte(vma, address, ptep, flags);
page = ERR_PTR(ret);
goto out;
@@ -259,7 +464,6 @@ retry:
}
if (flags & FOLL_SPLIT && PageTransCompound(page)) {
- int ret;
get_page(page);
pte_unmap_unlock(ptep, ptl);
lock_page(page);
@@ -271,9 +475,21 @@ retry:
goto retry;
}
- if (flags & FOLL_GET) {
- if (unlikely(!try_get_page(page))) {
- page = ERR_PTR(-ENOMEM);
+ /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ if (unlikely(!try_grab_page(page, flags))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ /*
+ * We need to make the page accessible if and only if we are going
+ * to access its content (the FOLL_PIN case). Please see
+ * Documentation/core-api/pin_user_pages.rst for details.
+ */
+ if (flags & FOLL_PIN) {
+ ret = arch_make_page_accessible(page);
+ if (ret) {
+ unpin_user_page(page);
+ page = ERR_PTR(ret);
goto out;
}
}
@@ -537,7 +753,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
/* make this handle hugepd */
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
+ WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
return page;
}
@@ -630,12 +846,12 @@ unmap:
}
/*
- * mmap_sem must be held on entry. If @nonblocking != NULL and
- * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
- * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
+ * mmap_sem must be held on entry. If @locked != NULL and *@flags
+ * does not include FOLL_NOWAIT, the mmap_sem may be released. If it
+ * is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *nonblocking)
+ unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -647,12 +863,15 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
if (*flags & FOLL_TRIED) {
- VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
+ * can co-exist
+ */
fault_flags |= FAULT_FLAG_TRIED;
}
@@ -673,8 +892,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *nonblocking = 0;
+ if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ *locked = 0;
return -EBUSY;
}
@@ -751,7 +970,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ * @locked: whether we're still with the mmap_sem held
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
@@ -786,13 +1005,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
- * or mmap_sem contention, and if waiting is needed to pin all pages,
- * *@nonblocking will be set to 0. Further, if @gup_flags does not
- * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
- * this case.
+ * If @locked != NULL, *@locked will be set to 0 when mmap_sem is
+ * released by an up_read(). That can happen if @gup_flags does not
+ * have FOLL_NOWAIT.
*
- * A caller using such a combination of @nonblocking and @gup_flags
+ * A caller using such a combination of @locked and @gup_flags
* must therefore hold the mmap_sem for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
@@ -804,7 +1021,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
+ struct vm_area_struct **vmas, int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -850,7 +1067,17 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags, nonblocking);
+ gup_flags, locked);
+ if (locked && *locked == 0) {
+ /*
+ * We've got a VM_FAULT_RETRY
+ * and we've lost mmap_sem.
+ * We must stop here.
+ */
+ BUG_ON(gup_flags & FOLL_NOWAIT);
+ BUG_ON(ret != 0);
+ goto out;
+ }
continue;
}
}
@@ -868,7 +1095,7 @@ retry:
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
ret = faultin_page(tsk, vma, start, &foll_flags,
- nonblocking);
+ locked);
switch (ret) {
case 0:
goto retry;
@@ -980,7 +1207,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
address = untagged_addr(address);
if (unlocked)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
retry:
vma = find_extend_vma(mm, address);
@@ -1004,7 +1231,6 @@ retry:
down_read(&mm->mmap_sem);
if (!(fault_flags & FAULT_FLAG_TRIED)) {
*unlocked = true;
- fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
fault_flags |= FAULT_FLAG_TRIED;
goto retry;
}
@@ -1088,17 +1314,36 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
if (likely(pages))
pages += ret;
start += ret << PAGE_SHIFT;
+ lock_dropped = true;
+retry:
/*
* Repeat on the address that fired VM_FAULT_RETRY
- * without FAULT_FLAG_ALLOW_RETRY but with
- * FAULT_FLAG_TRIED.
+ * with both FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED. Note that GUP can be interrupted
+ * by fatal signals, so we need to check it before we
+ * start trying again otherwise it can loop forever.
*/
+
+ if (fatal_signal_pending(current))
+ break;
+
*locked = 1;
- lock_dropped = true;
- down_read(&mm->mmap_sem);
+ ret = down_read_killable(&mm->mmap_sem);
+ if (ret) {
+ BUG_ON(ret > 0);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+
ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, NULL);
+ pages, NULL, locked);
+ if (!*locked) {
+ /* Continue to retry until we succeeded */
+ BUG_ON(ret != 0);
+ goto retry;
+ }
if (ret != 1) {
BUG_ON(ret > 1);
if (!pages_done)
@@ -1129,7 +1374,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* @vma: target vma
* @start: start address
* @end: end address
- * @nonblocking:
+ * @locked: whether the mmap_sem is still held
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
@@ -1137,14 +1382,14 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
*
* vma->vm_mm->mmap_sem must be held.
*
- * If @nonblocking is NULL, it may be held for read or write and will
+ * If @locked is NULL, it may be held for read or write and will
* be unperturbed.
*
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released. If it's released, *@nonblocking will be set to 0.
+ * If @locked is non-NULL, it must held for read only and may be
+ * released. If it's released, *@locked will be set to 0.
*/
long populate_vma_page_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
+ unsigned long start, unsigned long end, int *locked)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
@@ -1179,7 +1424,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* not result in a stack expansion that recurses back here.
*/
return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
+ NULL, NULL, locked);
}
/*
@@ -1557,6 +1802,37 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+#ifdef CONFIG_MMU
+static long __get_user_pages_remote(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * Parts of FOLL_LONGTERM behavior are incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. However, this only comes up if locked is set, and there are
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
+ * allow what we can.
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ if (WARN_ON_ONCE(locked))
+ return -EINVAL;
+ /*
+ * This will check the vmas (even if our vmas arg is NULL)
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
+ */
+ return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ vmas, gup_flags | FOLL_TOUCH |
+ FOLL_REMOTE);
+ }
+
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+
/*
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
@@ -1619,7 +1895,6 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-#ifdef CONFIG_MMU
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1632,28 +1907,8 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- /*
- * Parts of FOLL_LONGTERM behavior are incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. However, this only comes up if locked is set, and there are
- * callers that do request FOLL_LONGTERM, but do not set locked. So,
- * allow what we can.
- */
- if (gup_flags & FOLL_LONGTERM) {
- if (WARN_ON_ONCE(locked))
- return -EINVAL;
- /*
- * This will check the vmas (even if our vmas arg is NULL)
- * and return -ENOTSUPP if DAX isn't allowed in this case:
- */
- return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
- vmas, gup_flags | FOLL_TOUCH |
- FOLL_REMOTE);
- }
-
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+ return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -1665,6 +1920,15 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
{
return 0;
}
+
+static long __get_user_pages_remote(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
#endif /* !CONFIG_MMU */
/*
@@ -1804,7 +2068,31 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_FAST_GUP
+
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
+{
+ if (flags & FOLL_PIN) {
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
+ refs);
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_sub(page, refs);
+ else
+ refs *= GUP_PIN_COUNTING_BIAS;
+ }
+
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
@@ -1860,13 +2148,17 @@ static inline pte_t gup_get_pte(pte_t *ptep)
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+ unsigned int flags,
struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
ClearPageReferenced(page);
- put_page(page);
+ if (flags & FOLL_PIN)
+ unpin_user_page(page);
+ else
+ put_page(page);
}
}
@@ -1899,7 +2191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
+ undo_dev_pagemap(nr, nr_start, flags, pages);
goto pte_unmap;
}
} else if (pte_special(pte))
@@ -1908,17 +2200,30 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = try_get_compound_head(page, 1);
+ head = try_grab_compound_head(page, 1, flags);
if (!head)
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(head);
+ put_compound_head(head, 1, flags);
goto pte_unmap;
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ /*
+ * We need to make the page accessible if and only if we are
+ * going to access its content (the FOLL_PIN case). Please
+ * see Documentation/core-api/pin_user_pages.rst for
+ * details.
+ */
+ if (flags & FOLL_PIN) {
+ ret = arch_make_page_accessible(page);
+ if (ret) {
+ unpin_user_page(page);
+ goto pte_unmap;
+ }
+ }
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1953,7 +2258,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
@@ -1963,12 +2269,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
+ undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
SetPageReferenced(page);
pages[*nr] = page;
- get_page(page);
+ if (unlikely(!try_grab_page(page, flags))) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ return 0;
+ }
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
@@ -1979,48 +2288,52 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- undo_dev_pagemap(nr, nr_start, pages);
+ undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- undo_dev_pagemap(nr, nr_start, pages);
+ undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
#else
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
@@ -2038,18 +2351,6 @@ static int record_subpages(struct page *page, unsigned long addr,
return nr;
}
-static void put_compound_head(struct page *page, int refs)
-{
- VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
- /*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
- */
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
-}
-
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2083,12 +2384,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_get_compound_head(head, refs);
+ head = try_grab_compound_head(head, refs, flags);
if (!head)
return 0;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_compound_head(head, refs);
+ put_compound_head(head, refs, flags);
return 0;
}
@@ -2136,18 +2437,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
- return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
+ pages, nr);
}
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_get_compound_head(pmd_page(orig), refs);
+ head = try_grab_compound_head(pmd_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- put_compound_head(head, refs);
+ put_compound_head(head, refs, flags);
return 0;
}
@@ -2157,7 +2459,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
@@ -2168,18 +2471,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
- return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+ return __gup_device_huge_pud(orig, pudp, addr, end, flags,
+ pages, nr);
}
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_get_compound_head(pud_page(orig), refs);
+ head = try_grab_compound_head(pud_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- put_compound_head(head, refs);
+ put_compound_head(head, refs, flags);
return 0;
}
@@ -2203,12 +2507,12 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_get_compound_head(pgd_page(orig), refs);
+ head = try_grab_compound_head(pgd_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- put_compound_head(head, refs);
+ put_compound_head(head, refs, flags);
return 0;
}
@@ -2370,7 +2674,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
{
unsigned long len, end;
unsigned long flags;
- int nr = 0;
+ int nr_pinned = 0;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ */
+ unsigned int gup_flags = FOLL_GET;
+
+ if (write)
+ gup_flags |= FOLL_WRITE;
start = untagged_addr(start) & PAGE_MASK;
len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2396,11 +2708,11 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
local_irq_save(flags);
- gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
local_irq_restore(flags);
}
- return nr;
+ return nr_pinned;
}
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
@@ -2432,10 +2744,10 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
struct page **pages)
{
unsigned long addr, len, end;
- int nr = 0, ret = 0;
+ int nr_pinned = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
- FOLL_FORCE | FOLL_PIN)))
+ FOLL_FORCE | FOLL_PIN | FOLL_GET)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
@@ -2451,25 +2763,25 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
local_irq_disable();
- gup_pgd_range(addr, end, gup_flags, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned);
local_irq_enable();
- ret = nr;
+ ret = nr_pinned;
}
- if (nr < nr_pages) {
+ if (nr_pinned < nr_pages) {
/* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
+ start += nr_pinned << PAGE_SHIFT;
+ pages += nr_pinned;
- ret = __gup_longterm_unlocked(start, nr_pages - nr,
+ ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned,
gup_flags, pages);
/* Have to be a bit careful with return values */
- if (nr > 0) {
+ if (nr_pinned > 0) {
if (ret < 0)
- ret = nr;
+ ret = nr_pinned;
else
- ret += nr;
+ ret += nr_pinned;
}
}
@@ -2478,11 +2790,11 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
/**
* get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
@@ -2502,6 +2814,13 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
+ /*
+ * The caller may or may not have explicitly set FOLL_GET; either way is
+ * OK. However, internally (within mm/gup.c), gup fast variants must set
+ * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
+ * request.
+ */
+ gup_flags |= FOLL_GET;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -2509,9 +2828,18 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
/**
* pin_user_pages_fast() - pin user pages in memory without taking locks
*
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages_fast().
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
+ * get_user_pages_fast() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for further details.
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2519,21 +2847,39 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /*
- * This is a placeholder, until the pin functionality is activated.
- * Until then, just behave like the corresponding get_user_pages*()
- * routine.
- */
- return get_user_pages_fast(start, nr_pages, gup_flags, pages);
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
/**
* pin_user_pages_remote() - pin pages of a remote process (task != current)
*
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages_remote().
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
+ * get_user_pages_remote() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for details.
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2543,22 +2889,33 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- /*
- * This is a placeholder, until the pin functionality is activated.
- * Until then, just behave like the corresponding get_user_pages*()
- * routine.
- */
- return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
- vmas, locked);
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
/**
* pin_user_pages() - pin user pages in memory for use by other devices
*
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages().
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
+ * FOLL_PIN is set.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for details.
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2567,11 +2924,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- /*
- * This is a placeholder, until the pin functionality is activated.
- * Until then, just behave like the corresponding get_user_pages*()
- * routine.
- */
- return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 8dba38e79a9f..be690fa66a46 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -8,6 +8,8 @@
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
struct gup_benchmark {
__u64 get_delta_usec;
@@ -19,6 +21,48 @@ struct gup_benchmark {
__u64 expansion[10]; /* For future use */
};
+static void put_back_pages(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_LONGTERM_BENCHMARK:
+ case GUP_BENCHMARK:
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ break;
+
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
+ unpin_user_pages(pages, nr_pages);
+ break;
+ }
+}
+
+static void verify_dma_pinned(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ struct page *page;
+
+ switch (cmd) {
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
+ for (i = 0; i < nr_pages; i++) {
+ page = pages[i];
+ if (WARN(!page_maybe_dma_pinned(page),
+ "pages[%lu] is NOT dma-pinned\n", i)) {
+
+ dump_page(page, "gup_benchmark failure");
+ break;
+ }
+ }
+ break;
+ }
+}
+
static int __gup_benchmark_ioctl(unsigned int cmd,
struct gup_benchmark *gup)
{
@@ -66,6 +110,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
+ case PIN_FAST_BENCHMARK:
+ nr = pin_user_pages_fast(addr, nr, gup->flags,
+ pages + i);
+ break;
+ case PIN_BENCHMARK:
+ nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+ NULL);
+ break;
default:
kvfree(pages);
ret = -EINVAL;
@@ -78,15 +130,22 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
}
end_time = ktime_get();
+ /* Shifting the meaning of nr_pages: now it is actual number pinned: */
+ nr_pages = i;
+
gup->get_delta_usec = ktime_us_delta(end_time, start_time);
gup->size = addr - gup->addr;
+ /*
+ * Take an un-benchmark-timed moment to verify DMA pinned
+ * state: print a warning if any non-dma-pinned pages are found:
+ */
+ verify_dma_pinned(cmd, pages, nr_pages);
+
start_time = ktime_get();
- for (i = 0; i < nr_pages; i++) {
- if (!pages[i])
- break;
- put_page(pages[i]);
- }
+
+ put_back_pages(cmd, pages, nr_pages);
+
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
@@ -105,6 +164,8 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
case GUP_FAST_BENCHMARK:
case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
break;
default:
return -EINVAL;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 24ad53b4dfc0..b1e069e68189 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -958,6 +958,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
*/
WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -973,7 +978,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
* device mapped pages can only be returned if the
* caller will manage the page reference count.
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
@@ -981,7 +986,8 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
@@ -1101,6 +1107,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (flags & FOLL_WRITE && !pud_write(*pud))
return NULL;
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
if (pud_present(*pud) && pud_devmap(*pud))
/* pass */;
else
@@ -1112,8 +1123,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
/*
* device mapped pages can only be returned if the
* caller will manage the page reference count.
+ *
+ * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -1121,7 +1134,8 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
@@ -1497,8 +1511,13 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if (!try_grab_page(page, flags))
+ return ERR_PTR(-ENOMEM);
+
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
+
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1535,8 +1554,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
- if (flags & FOLL_GET)
- get_page(page);
out:
return page;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd8737a94bec..f9ea1e5197b4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,132 +220,303 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
return subpool_inode(file_inode(vma->vm_file));
}
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- * across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock. The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map. The from and to elements are huge page
- * indicies into the associated mapping. from indicates the starting index
- * of the region. to represents the first index past the end of the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping. It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
*/
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
+{
+ struct file_region *nrg = NULL;
+
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+ VM_BUG_ON(!nrg);
+ list_del(&nrg->link);
+
+ nrg->from = from;
+ nrg->to = to;
+
+ return nrg;
+}
+
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+ struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ nrg->reservation_counter = rg->reservation_counter;
+ nrg->css = rg->css;
+ if (rg->css)
+ css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+ struct hstate *h,
+ struct resv_map *resv,
+ struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (h_cg) {
+ nrg->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ nrg->css = &h_cg->css;
+ if (!resv->pages_per_hpage)
+ resv->pages_per_hpage = pages_per_huge_page(h);
+ /* pages_per_hpage should be the same for all entries in
+ * a resv_map.
+ */
+ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+ } else {
+ nrg->reservation_counter = NULL;
+ nrg->css = NULL;
+ }
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+ struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ return rg && org &&
+ rg->reservation_counter == org->reservation_counter &&
+ rg->css == org->css;
+
+#else
+ return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+ struct file_region *nrg = NULL, *prg = NULL;
+
+ prg = list_prev_entry(rg, link);
+ if (&prg->link != &resv->regions && prg->to == rg->from &&
+ has_same_uncharge_info(prg, rg)) {
+ prg->to = rg->to;
+
+ list_del(&rg->link);
+ kfree(rg);
+
+ coalesce_file_region(resv, prg);
+ return;
+ }
+
+ nrg = list_next_entry(rg, link);
+ if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+ has_same_uncharge_info(nrg, rg)) {
+ nrg->from = rg->from;
+
+ list_del(&rg->link);
+ kfree(rg);
+
+ coalesce_file_region(resv, nrg);
+ return;
+ }
+}
/* Must be called with resv->lock held. Calling this with count_only == true
* will count the number of pages to be added but will not modify the linked
- * list.
+ * list. If regions_needed != NULL and count_only == true, then regions_needed
+ * will indicate the number of file_regions needed in the cache to carry out to
+ * add the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h, long *regions_needed,
bool count_only)
{
- long chg = 0;
+ long add = 0;
struct list_head *head = &resv->regions;
+ long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
+ if (regions_needed)
+ *regions_needed = 0;
- chg = t - f;
+ /* In this loop, we essentially handle an entry for the range
+ * [last_accounted_offset, rg->from), at every iteration, with some
+ * bounds checking.
+ */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ /* Skip irrelevant regions that start before our range. */
+ if (rg->from < f) {
+ /* If this region ends after the last accounted offset,
+ * then we need to update last_accounted_offset.
+ */
+ if (rg->to > last_accounted_offset)
+ last_accounted_offset = rg->to;
+ continue;
+ }
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
+ /* When we find a region that starts beyond our range, we've
+ * finished.
+ */
if (rg->from > t)
break;
- /* We overlap with this area, if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation.
+ /* Add an entry for last_accounted_offset -> rg->from, and
+ * update last_accounted_offset.
+ */
+ if (rg->from > last_accounted_offset) {
+ add += rg->from - last_accounted_offset;
+ if (!count_only) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, rg->from);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h,
+ resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else if (regions_needed)
+ *regions_needed += 1;
+ }
+
+ last_accounted_offset = rg->to;
+ }
+
+ /* Handle the case where our range extends beyond
+ * last_accounted_offset.
+ */
+ if (last_accounted_offset < t) {
+ add += t - last_accounted_offset;
+ if (!count_only) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, t);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else if (regions_needed)
+ *regions_needed += 1;
+ }
+
+ VM_BUG_ON(add < 0);
+ return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+ int regions_needed)
+ __must_hold(&resv->lock)
+{
+ struct list_head allocated_regions;
+ int to_allocate = 0, i = 0;
+ struct file_region *trg = NULL, *rg = NULL;
+
+ VM_BUG_ON(regions_needed < 0);
+
+ INIT_LIST_HEAD(&allocated_regions);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations plus regions_needed.
+ *
+ * This is a while loop because when we drop the lock, some other call
+ * to region_add or region_del may have consumed some region_entries,
+ * so we keep looping here until we finally have enough entries for
+ * (adds_in_progress + regions_needed).
+ */
+ while (resv->region_cache_count <
+ (resv->adds_in_progress + regions_needed)) {
+ to_allocate = resv->adds_in_progress + regions_needed -
+ resv->region_cache_count;
+
+ /* At this point, we should have enough entries in the cache
+ * for all the existings adds_in_progress. We should only be
+ * needing to allocate for regions_needed.
*/
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
+ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+ spin_unlock(&resv->lock);
+ for (i = 0; i < to_allocate; i++) {
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ goto out_of_memory;
+ list_add(&trg->link, &allocated_regions);
}
- chg -= rg->to - rg->from;
- if (!count_only && rg != nrg) {
+ spin_lock(&resv->lock);
+
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
list_del(&rg->link);
- kfree(rg);
+ list_add(&rg->link, &resv->region_cache);
+ resv->region_cache_count++;
}
}
- if (!count_only) {
- nrg->from = f;
- nrg->to = t;
- }
+ return 0;
- return chg;
+out_of_memory:
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return -ENOMEM;
}
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the specified
- * range, or a region will be taken from the cache. Sufficient regions
- * must exist in the cache due to the previous call to region_chg with
- * the same range.
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
*
- * Return the number of new huge pages added to the map. This
- * number is greater than or equal to zero.
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it ruturns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
*/
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct resv_map *resv, long f, long t,
+ long in_regions_needed, struct hstate *h,
+ struct hugetlb_cgroup *h_cg)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg;
- long add = 0;
+ long add = 0, actual_regions_needed = 0;
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+retry:
+
+ /* Count how many regions are actually needed to execute this add. */
+ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
+ true);
/*
- * If no region exists which can be expanded to include the
- * specified range, pull a region descriptor from the cache
- * and use it for this range.
+ * Check for sufficient descriptors in the cache to accommodate
+ * this add operation. Note that actual_regions_needed may be greater
+ * than in_regions_needed, as the resv_map may have been modified since
+ * the region_chg call. In this case, we need to make sure that we
+ * allocate extra entries, such that we have enough for all the
+ * existing adds_in_progress, plus the excess needed for this
+ * operation.
*/
- if (&rg->link == head || t < rg->from) {
- VM_BUG_ON(resv->region_cache_count <= 0);
-
- resv->region_cache_count--;
- nrg = list_first_entry(&resv->region_cache, struct file_region,
- link);
- list_del(&nrg->link);
+ if (actual_regions_needed > in_regions_needed &&
+ resv->region_cache_count <
+ resv->adds_in_progress +
+ (actual_regions_needed - in_regions_needed)) {
+ /* region_add operation of range 1 should never need to
+ * allocate file_region entries.
+ */
+ VM_BUG_ON(t - f <= 1);
- nrg->from = f;
- nrg->to = t;
- list_add(&nrg->link, rg->link.prev);
+ if (allocate_file_region_entries(
+ resv, actual_regions_needed - in_regions_needed)) {
+ return -ENOMEM;
+ }
- add += t - f;
- goto out_locked;
+ goto retry;
}
- add = add_reservation_in_range(resv, f, t, false);
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+
+ resv->adds_in_progress -= in_regions_needed;
-out_locked:
- resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -358,46 +529,37 @@ out_locked:
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
- * map. A new file_region structure is added to the cache
- * as a placeholder, so that the subsequent region_add
- * call will have all the regions it needs and will not fail.
+ * map. A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
+ *
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
* zero. -ENOMEM is returned if a new file_region structure or cache entry
* is needed and can not be allocated.
*/
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+ long *out_regions_needed)
{
long chg = 0;
spin_lock(&resv->lock);
-retry_locked:
- resv->adds_in_progress++;
- /*
- * Check for sufficient descriptors in the cache to accommodate
- * the number of in progress add operations.
- */
- if (resv->adds_in_progress > resv->region_cache_count) {
- struct file_region *trg;
-
- VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
- /* Must drop lock to allocate a new descriptor. */
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
+ /* Count how many hugepages in this range are NOT respresented. */
+ chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+ out_regions_needed, true);
- trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg)
- return -ENOMEM;
+ if (*out_regions_needed == 0)
+ *out_regions_needed = 1;
- spin_lock(&resv->lock);
- list_add(&trg->link, &resv->region_cache);
- resv->region_cache_count++;
- goto retry_locked;
- }
+ if (allocate_file_region_entries(resv, *out_regions_needed))
+ return -ENOMEM;
- chg = add_reservation_in_range(resv, f, t, true);
+ resv->adds_in_progress += *out_regions_needed;
spin_unlock(&resv->lock);
return chg;
@@ -408,17 +570,20 @@ retry_locked:
* of the resv_map keeps track of the operations in progress between
* calls to region_chg and region_add. Operations are sometimes
* aborted after the call to region_chg. In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
*
* NOTE: The range arguments [f, t) are not needed or used in this
* routine. They are kept to make reading the calling code easier as
* arguments will match the associated region_chg call.
*/
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+ long regions_needed)
{
spin_lock(&resv->lock);
VM_BUG_ON(!resv->region_cache_count);
- resv->adds_in_progress--;
+ resv->adds_in_progress -= regions_needed;
spin_unlock(&resv->lock);
}
@@ -486,11 +651,17 @@ retry:
/* New entry for end of split region */
nrg->from = t;
nrg->to = rg->to;
+
+ copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
INIT_LIST_HEAD(&nrg->link);
/* Original entry is trimmed */
rg->to = f;
+ hugetlb_cgroup_uncharge_file_region(
+ resv, rg, nrg->to - nrg->from);
+
list_add(&nrg->link, &rg->link);
nrg = NULL;
break;
@@ -498,6 +669,8 @@ retry:
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - rg->from);
list_del(&rg->link);
kfree(rg);
continue;
@@ -506,9 +679,15 @@ retry:
if (f <= rg->from) { /* Trim beginning of region */
del += t - rg->from;
rg->from = t;
+
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ t - rg->from);
} else { /* Trim end of region */
del += rg->to - f;
rg->to = f;
+
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - f);
}
}
@@ -650,6 +829,25 @@ static void set_vma_private_data(struct vm_area_struct *vma,
vma->vm_private_data = (void *)value;
}
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (!h_cg || !h) {
+ resv_map->reservation_counter = NULL;
+ resv_map->pages_per_hpage = 0;
+ resv_map->css = NULL;
+ } else {
+ resv_map->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ resv_map->pages_per_hpage = pages_per_huge_page(h);
+ resv_map->css = &h_cg->css;
+ }
+#endif
+}
+
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -666,6 +864,13 @@ struct resv_map *resv_map_alloc(void)
INIT_LIST_HEAD(&resv_map->regions);
resv_map->adds_in_progress = 0;
+ /*
+ * Initialize these to 0. On shared mappings, 0's here indicate these
+ * fields don't do cgroup accounting. On private mappings, these will be
+ * re-initialized to the proper values, to indicate that hugetlb cgroup
+ * reservations are to be un-charged from here.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
@@ -1009,6 +1214,9 @@ static void destroy_compound_gigantic_page(struct page *page,
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
@@ -1069,6 +1277,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
@@ -1180,6 +1389,8 @@ static void __free_huge_page(struct page *page)
clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1254,6 +1465,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
+ set_hugetlb_cgroup_rsvd(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -1287,6 +1499,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
+
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
}
/*
@@ -1313,7 +1528,107 @@ int PageHeadHuge(struct page *page_head)
if (!PageHead(page_head))
return 0;
- return get_compound_page_dtor(page_head) == free_huge_page;
+ return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find address_space associated with hugetlbfs page.
+ * Upon entry page is locked and page 'was' mapped although mapped state
+ * could change. If necessary, use anon_vma to find vma and associated
+ * address space. The returned mapping may be stale, but it can not be
+ * invalid as page lock (which is held) is required to destroy mapping.
+ */
+static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff_start, pgoff_end;
+ struct anon_vma_chain *avc;
+ struct address_space *mapping = page_mapping(hpage);
+
+ /* Simple file based mapping */
+ if (mapping)
+ return mapping;
+
+ /*
+ * Even anonymous hugetlbfs mappings are associated with an
+ * underlying hugetlbfs file (see hugetlb_file_setup in mmap
+ * code). Find a vma associated with the anonymous vma, and
+ * use the file pointer to get address_space.
+ */
+ anon_vma = page_lock_anon_vma_read(hpage);
+ if (!anon_vma)
+ return mapping; /* NULL */
+
+ /* Use first found vma */
+ pgoff_start = page_to_pgoff(hpage);
+ pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff_start, pgoff_end) {
+ struct vm_area_struct *vma = avc->vma;
+
+ mapping = vma->vm_file->f_mapping;
+ break;
+ }
+
+ anon_vma_unlock_read(anon_vma);
+ return mapping;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which allows us to find the mapping
+ * even in the case of an anon page. However, locking order dictates
+ * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs
+ * specific. So, we first try to lock the sema while still holding the
+ * page lock. If this works, great! If not, then we need to drop the
+ * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of
+ * course, need to revalidate state along the way.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+ struct address_space *mapping, *mapping2;
+
+ mapping = _get_hugetlb_page_mapping(hpage);
+retry:
+ if (!mapping)
+ return mapping;
+
+ /*
+ * If no contention, take lock and return
+ */
+ if (i_mmap_trylock_write(mapping))
+ return mapping;
+
+ /*
+ * Must drop page lock and wait on mapping sema.
+ * Note: Once page lock is dropped, mapping could become invalid.
+ * As a hack, increase map count until we lock page again.
+ */
+ atomic_inc(&hpage->_mapcount);
+ unlock_page(hpage);
+ i_mmap_lock_write(mapping);
+ lock_page(hpage);
+ atomic_add_negative(-1, &hpage->_mapcount);
+
+ /* verify page is still mapped */
+ if (!page_mapped(hpage)) {
+ i_mmap_unlock_write(mapping);
+ return NULL;
+ }
+
+ /*
+ * Get address space again and verify it is the same one
+ * we locked. If not, drop lock and retry.
+ */
+ mapping2 = _get_hugetlb_page_mapping(hpage);
+ if (mapping2 != mapping) {
+ i_mmap_unlock_write(mapping);
+ mapping = mapping2;
+ goto retry;
+ }
+
+ return mapping;
}
pgoff_t __basepage_index(struct page *page)
@@ -1870,6 +2185,7 @@ static long __vma_reservation_common(struct hstate *h,
struct resv_map *resv;
pgoff_t idx;
long ret;
+ long dummy_out_regions_needed;
resv = vma_resv_map(vma);
if (!resv)
@@ -1878,20 +2194,29 @@ static long __vma_reservation_common(struct hstate *h,
idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
case VMA_NEEDS_RESV:
- ret = region_chg(resv, idx, idx + 1);
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+ /* We assume that vma_reservation_* routines always operate on
+ * 1 page, and that adding to resv map a 1 page entry can only
+ * ever require 1 region.
+ */
+ VM_BUG_ON(dummy_out_regions_needed != 1);
break;
case VMA_COMMIT_RESV:
- ret = region_add(resv, idx, idx + 1);
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
break;
case VMA_END_RESV:
- region_abort(resv, idx, idx + 1);
+ region_abort(resv, idx, idx + 1, 1);
ret = 0;
break;
case VMA_ADD_RESV:
- if (vma->vm_flags & VM_MAYSHARE)
- ret = region_add(resv, idx, idx + 1);
- else {
- region_abort(resv, idx, idx + 1);
+ if (vma->vm_flags & VM_MAYSHARE) {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ } else {
+ region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
}
break;
@@ -2002,6 +2327,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
+ bool deferred_reserve;
idx = hstate_index(h);
/*
@@ -2039,9 +2365,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
gbl_chg = 1;
}
+ /* If this allocation is not consuming a reservation, charge it now.
+ */
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ if (deferred_reserve) {
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_subpool_put;
+ }
+
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
- goto out_subpool_put;
+ goto out_uncharge_cgroup_reservation;
spin_lock(&hugetlb_lock);
/*
@@ -2064,6 +2400,14 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ /* If allocation is not consuming a reservation, also store the
+ * hugetlb_cgroup pointer on the page.
+ */
+ if (deferred_reserve) {
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
+ h_cg, page);
+ }
+
spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
@@ -2088,6 +2432,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_uncharge_cgroup_reservation:
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
+ h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
@@ -3188,9 +3536,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) - region_count(resv, start, end);
-
- kref_put(&resv->refs, resv_map_release);
-
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
if (reserve) {
/*
* Decrement reserve counts. The global reserve count may be
@@ -3199,6 +3545,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
+
+ kref_put(&resv->refs, resv_map_release);
}
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -3306,6 +3654,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
@@ -3316,6 +3665,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
@@ -3393,6 +3750,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(&range);
+ else
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -3812,16 +4171,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
+ * We can not race with truncation due to holding i_mmap_rwsem.
+ * i_size is modified when holding i_mmap_rwsem, so check here
+ * once for faults beyond end of file.
*/
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
/*
* Check for page in userfault range
*/
@@ -3841,13 +4201,15 @@ retry:
};
/*
- * hugetlb_fault_mutex must be dropped before
- * handling userfault. Reacquire after handling
- * fault to make calling code simpler.
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3925,10 +4287,6 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
-
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
@@ -4012,6 +4370,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -4025,14 +4388,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with i_size modifications during truncation.
+ *
+ * ptep could have already be assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4120,6 +4500,7 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4266,7 +4647,7 @@ out_release_nounlock:
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
+ long i, unsigned int flags, int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -4337,14 +4718,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_KILLABLE;
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
if (flags & FOLL_TRIED) {
- VM_WARN_ON_ONCE(fault_flags &
- FAULT_FLAG_ALLOW_RETRY);
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED can co-exist
+ */
fault_flags |= FAULT_FLAG_TRIED;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
@@ -4354,9 +4738,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking &&
+ if (locked &&
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *nonblocking = 0;
+ *locked = 0;
*nr_pages = 0;
/*
* VM_FAULT_RETRY must not return an
@@ -4376,19 +4760,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
/*
- * Instead of doing 'try_get_page()' below in the same_page
- * loop, just check the count once here.
- */
- if (unlikely(page_count(page) <= 0)) {
- if (pages) {
- spin_unlock(ptl);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- /*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
@@ -4405,7 +4776,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables. If
+ * the huge page is present, then the tail pages must
+ * also be present. The ptl prevents the head page and
+ * tail pages from being rearranged in any way. So this
+ * page must be available at this point, unless the page
+ * refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
}
if (vmas)
@@ -4541,11 +4927,12 @@ int hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg;
+ long ret, chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
- long gbl_reserve;
+ struct hugetlb_cgroup *h_cg = NULL;
+ long gbl_reserve, regions_needed = 0;
/* This should never happen */
if (from > to) {
@@ -4575,9 +4962,10 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
resv_map = inode_resv_map(inode);
- chg = region_chg(resv_map, from, to);
+ chg = region_chg(resv_map, from, to, &regions_needed);
} else {
+ /* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
@@ -4593,6 +4981,21 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
+ * of the resv_map.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+ }
+
/*
* There must be enough pages in the subpool for the mapping. If
* the subpool has a minimum size, there may be some global
@@ -4601,7 +5004,7 @@ int hugetlb_reserve_pages(struct inode *inode,
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
if (gbl_reserve < 0) {
ret = -ENOSPC;
- goto out_err;
+ goto out_uncharge_cgroup;
}
/*
@@ -4610,9 +5013,7 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
- goto out_err;
+ goto out_put_pages;
}
/*
@@ -4627,9 +5028,12 @@ int hugetlb_reserve_pages(struct inode *inode,
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
- long add = region_add(resv_map, from, to);
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
- if (unlikely(chg > add)) {
+ if (unlikely(add < 0)) {
+ hugetlb_acct_memory(h, -gbl_reserve);
+ goto out_put_pages;
+ } else if (unlikely(chg > add)) {
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
@@ -4639,17 +5043,29 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
long rsv_adjust;
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
+ hstate_index(h),
+ (chg - add) * pages_per_huge_page(h), h_cg);
+
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
}
}
return 0;
+out_put_pages:
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), h_cg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- /* Don't call region_abort if region_chg failed */
- if (chg >= 0)
- region_abort(resv_map, from, to);
+ /* Only call region_abort if the region_chg succeeded but the
+ * region_add failed or didn't run.
+ */
+ if (chg >= 0 && add < 0)
+ region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
@@ -4740,7 +5156,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr = *start;
+ unsigned long check_addr;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
@@ -4765,10 +5181,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space. This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4785,7 +5203,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4815,7 +5232,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_read(mapping);
return pte;
}
@@ -4826,7 +5242,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
@@ -4965,6 +5381,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
struct page *page = NULL;
spinlock_t *ptl;
pte_t pte;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4977,8 +5399,18 @@ retry:
pte = huge_ptep_get((pte_t *)pmd);
if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
+ /*
+ * try_grab_page() should always succeed here, because: a) we
+ * hold the pmd (ptl) lock, and b) we've just checked that the
+ * huge pmd (head) page is present in the page tables. The ptl
+ * prevents the head page and tail pages from being rearranged
+ * in any way. So this page must be available at this point,
+ * unless the page refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
@@ -4999,7 +5431,7 @@ struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5008,7 +5440,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 5280bcf459af..c2d7ae6cabd1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -23,29 +23,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
-enum hugetlb_memory_event {
- HUGETLB_MAX,
- HUGETLB_NR_MEMORY_EVENTS,
-};
-
-struct hugetlb_cgroup {
- struct cgroup_subsys_state css;
-
- /*
- * the counter to account for hugepages from hugetlb.
- */
- struct page_counter hugepage[HUGE_MAX_HSTATE];
-
- atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
- atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
-
- /* Handle for "hugetlb.events" */
- struct cgroup_file events_file[HUGE_MAX_HSTATE];
-
- /* Handle for "hugetlb.events.local" */
- struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
-};
-
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -55,6 +32,27 @@ struct hugetlb_cgroup {
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+static inline struct page_counter *
+__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
+ bool rsvd)
+{
+ if (rsvd)
+ return &h_cg->rsvd_hugepage[idx];
+ return &h_cg->hugepage[idx];
+}
+
+static inline struct page_counter *
+hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
+{
+ return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
+}
+
+static inline struct page_counter *
+hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
+{
+ return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
+}
+
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
@@ -83,8 +81,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
int idx;
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
- if (page_counter_read(&h_cg->hugepage[idx]))
+ if (page_counter_read(
+ hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) ||
+ page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd(
+ h_cg, idx))) {
return true;
+ }
}
return false;
}
@@ -95,18 +97,34 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *counter = &h_cgroup->hugepage[idx];
- struct page_counter *parent = NULL;
+ struct page_counter *fault_parent = NULL;
+ struct page_counter *rsvd_parent = NULL;
unsigned long limit;
int ret;
- if (parent_h_cgroup)
- parent = &parent_h_cgroup->hugepage[idx];
- page_counter_init(counter, parent);
+ if (parent_h_cgroup) {
+ fault_parent = hugetlb_cgroup_counter_from_cgroup(
+ parent_h_cgroup, idx);
+ rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
+ parent_h_cgroup, idx);
+ }
+ page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
+ idx),
+ fault_parent);
+ page_counter_init(
+ hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
+ rsvd_parent);
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_set_max(counter, limit);
+
+ ret = page_counter_set_max(
+ hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
+ limit);
+ VM_BUG_ON(ret);
+ ret = page_counter_set_max(
+ hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
+ limit);
VM_BUG_ON(ret);
}
}
@@ -136,7 +154,6 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(h_cgroup);
}
-
/*
* Should be called with hugetlb_lock held.
* Since we are holding hugetlb_lock, pages cannot get moved from
@@ -213,8 +230,9 @@ static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
!hugetlb_cgroup_is_root(hugetlb));
}
-int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup **ptr)
+static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr,
+ bool rsvd)
{
int ret = 0;
struct page_counter *counter;
@@ -237,50 +255,103 @@ again:
}
rcu_read_unlock();
- if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
- &counter)) {
+ if (!page_counter_try_charge(
+ __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
+ nr_pages, &counter)) {
ret = -ENOMEM;
hugetlb_event(h_cg, idx, HUGETLB_MAX);
+ css_put(&h_cg->css);
+ goto done;
}
- css_put(&h_cg->css);
+ /* Reservations take a reference to the css because they do not get
+ * reparented.
+ */
+ if (!rsvd)
+ css_put(&h_cg->css);
done:
*ptr = h_cg;
return ret;
}
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
+}
+
+int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
+}
+
/* Should be called with hugetlb_lock held */
-void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg,
- struct page *page)
+static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page, bool rsvd)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- set_hugetlb_cgroup(page, h_cg);
+ __set_hugetlb_cgroup(page, h_cg, rsvd);
return;
}
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
+}
+
+void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
+}
+
/*
* Should be called with hugetlb_lock held
*/
-void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page, bool rsvd)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(page);
+ h_cg = __hugetlb_cgroup_from_page(page, rsvd);
if (unlikely(!h_cg))
return;
- set_hugetlb_cgroup(page, NULL);
- page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
+ __set_hugetlb_cgroup(page, NULL, rsvd);
+
+ page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
+ rsvd),
+ nr_pages);
+
+ if (rsvd)
+ css_put(&h_cg->css);
+
return;
}
-void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg)
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
+}
+
+void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
+}
+
+static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ bool rsvd)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
@@ -288,34 +359,91 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
return;
- page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
- return;
+ page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
+ rsvd),
+ nr_pages);
+
+ if (rsvd)
+ css_put(&h_cg->css);
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
+}
+
+void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
+}
+
+void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
+ unsigned long end)
+{
+ if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
+ !resv->css)
+ return;
+
+ page_counter_uncharge(resv->reservation_counter,
+ (end - start) * resv->pages_per_hpage);
+ css_put(resv->css);
+}
+
+void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
+ struct file_region *rg,
+ unsigned long nr_pages)
+{
+ if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
+ return;
+
+ if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ !resv->reservation_counter) {
+ page_counter_uncharge(rg->reservation_counter,
+ nr_pages * resv->pages_per_hpage);
+ css_put(rg->css);
+ }
}
enum {
RES_USAGE,
+ RES_RSVD_USAGE,
RES_LIMIT,
+ RES_RSVD_LIMIT,
RES_MAX_USAGE,
+ RES_RSVD_MAX_USAGE,
RES_FAILCNT,
+ RES_RSVD_FAILCNT,
};
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct page_counter *counter;
+ struct page_counter *rsvd_counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
+ rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_RSVD_USAGE:
+ return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
case RES_LIMIT:
return (u64)counter->max * PAGE_SIZE;
+ case RES_RSVD_LIMIT:
+ return (u64)rsvd_counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
+ case RES_RSVD_MAX_USAGE:
+ return (u64)rsvd_counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
return counter->failcnt;
+ case RES_RSVD_FAILCNT:
+ return rsvd_counter->failcnt;
default:
BUG();
}
@@ -337,10 +465,16 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(cft->private)) {
+ case RES_RSVD_USAGE:
+ counter = &h_cg->rsvd_hugepage[idx];
+ /* Fall through. */
case RES_USAGE:
val = (u64)page_counter_read(counter);
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
+ case RES_RSVD_LIMIT:
+ counter = &h_cg->rsvd_hugepage[idx];
+ /* Fall through. */
case RES_LIMIT:
val = (u64)counter->max;
if (val == limit)
@@ -364,6 +498,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
int ret, idx;
unsigned long nr_pages;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ bool rsvd = false;
if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
return -EINVAL;
@@ -377,9 +512,14 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
+ case RES_RSVD_LIMIT:
+ rsvd = true;
+ /* Fall through. */
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
+ ret = page_counter_set_max(
+ __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
+ nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
@@ -405,18 +545,25 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
int ret = 0;
- struct page_counter *counter;
+ struct page_counter *counter, *rsvd_counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
+ rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
page_counter_reset_watermark(counter);
break;
+ case RES_RSVD_MAX_USAGE:
+ page_counter_reset_watermark(rsvd_counter);
+ break;
case RES_FAILCNT:
counter->failcnt = 0;
break;
+ case RES_RSVD_FAILCNT:
+ rsvd_counter->failcnt = 0;
+ break;
default:
ret = -EINVAL;
break;
@@ -471,7 +618,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
struct hstate *h = &hstates[idx];
/* format the size */
- mem_fmt(buf, 32, huge_page_size(h));
+ mem_fmt(buf, sizeof(buf), huge_page_size(h));
/* Add the limit file */
cft = &h->cgroup_files_dfl[0];
@@ -481,15 +628,30 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
cft->write = hugetlb_cgroup_write_dfl;
cft->flags = CFTYPE_NOT_ON_ROOT;
- /* Add the current usage file */
+ /* Add the reservation limit file */
cft = &h->cgroup_files_dfl[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->write = hugetlb_cgroup_write_dfl;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the current usage file */
+ cft = &h->cgroup_files_dfl[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->seq_show = hugetlb_cgroup_read_u64_max;
cft->flags = CFTYPE_NOT_ON_ROOT;
+ /* Add the current reservation usage file */
+ cft = &h->cgroup_files_dfl[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
/* Add the events file */
- cft = &h->cgroup_files_dfl[2];
+ cft = &h->cgroup_files_dfl[4];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_show;
@@ -497,7 +659,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
cft->flags = CFTYPE_NOT_ON_ROOT;
/* Add the events.local file */
- cft = &h->cgroup_files_dfl[3];
+ cft = &h->cgroup_files_dfl[5];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_local_show;
@@ -506,7 +668,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
- cft = &h->cgroup_files_dfl[4];
+ cft = &h->cgroup_files_dfl[6];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
@@ -520,7 +682,7 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
struct hstate *h = &hstates[idx];
/* format the size */
- mem_fmt(buf, 32, huge_page_size(h));
+ mem_fmt(buf, sizeof(buf), huge_page_size(h));
/* Add the limit file */
cft = &h->cgroup_files_legacy[0];
@@ -529,28 +691,55 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
cft->read_u64 = hugetlb_cgroup_read_u64;
cft->write = hugetlb_cgroup_write_legacy;
- /* Add the usage file */
+ /* Add the reservation limit file */
cft = &h->cgroup_files_legacy[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->write = hugetlb_cgroup_write_legacy;
+
+ /* Add the usage file */
+ cft = &h->cgroup_files_legacy[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read_u64 = hugetlb_cgroup_read_u64;
+ /* Add the reservation usage file */
+ cft = &h->cgroup_files_legacy[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
/* Add the MAX usage file */
- cft = &h->cgroup_files_legacy[2];
+ cft = &h->cgroup_files_legacy[4];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
+ /* Add the MAX reservation usage file */
+ cft = &h->cgroup_files_legacy[5];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
/* Add the failcntfile */
- cft = &h->cgroup_files_legacy[3];
+ cft = &h->cgroup_files_legacy[6];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
- cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+ cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the reservation failcntfile */
+ cft = &h->cgroup_files_legacy[7];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
- cft = &h->cgroup_files_legacy[4];
+ cft = &h->cgroup_files_legacy[8];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
@@ -585,6 +774,7 @@ void __init hugetlb_cgroup_file_init(void)
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg_rsvd;
struct hstate *h = page_hstate(oldhpage);
if (hugetlb_cgroup_disabled())
@@ -593,10 +783,13 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
+ h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
+ set_hugetlb_cgroup_rsvd(oldhpage, NULL);
/* move the h_cg details to new cgroup */
set_hugetlb_cgroup(newhpage, h_cg);
+ set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
list_move(&newhpage->lru, &h->hugepage_activelist);
spin_unlock(&hugetlb_lock);
return;
diff --git a/mm/internal.h b/mm/internal.h
index 3cf20ab3ca01..2d58ae15a958 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,29 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
ra->start, ra->size, ra->async_size);
}
+/**
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ */
+static inline bool page_evictable(struct page *page)
+{
+ bool ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
@@ -206,6 +229,7 @@ struct compact_control {
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
bool rescan; /* Rescanning the same pageblock */
+ bool alloc_contig; /* alloc_contig_range allocation */
};
/*
@@ -377,10 +401,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
/*
* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
* anything, so we only pin the file and drop the mmap_sem if only
- * FAULT_FLAG_ALLOW_RETRY is set.
+ * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
*/
- if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
- FAULT_FLAG_ALLOW_RETRY) {
+ if (fault_flag_allow_retry_first(flags) &&
+ !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
up_read(&vmf->vma->vm_mm->mmap_sem);
}
@@ -532,7 +556,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
-#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
+#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6aa51723b92b..e61b4a492218 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -105,7 +105,8 @@ EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+ if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
return __memset(addr, c, len);
}
@@ -114,8 +115,9 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
return __memmove(dest, src, len);
}
@@ -124,8 +126,9 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
return __memcpy(dest, src, len);
}
@@ -634,12 +637,21 @@ void kasan_free_shadow(const struct vm_struct *vm)
#endif
extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
+extern bool report_enabled(void);
-void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
{
unsigned long flags = user_access_save();
- __kasan_report(addr, size, is_write, ip);
+ bool ret = false;
+
+ if (likely(report_enabled())) {
+ __kasan_report(addr, size, is_write, ip);
+ ret = true;
+ }
+
user_access_restore(flags);
+
+ return ret;
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 616f9dd82d12..56ff8885fe2e 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -173,17 +173,18 @@ static __always_inline bool check_memory_region_inline(unsigned long addr,
if (unlikely(size == 0))
return true;
+ if (unlikely(addr + size < addr))
+ return !kasan_report(addr, size, write, ret_ip);
+
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, ret_ip);
- return false;
+ return !kasan_report(addr, size, write, ret_ip);
}
if (likely(!memory_is_poisoned(addr, size)))
return true;
- kasan_report(addr, size, write, ret_ip);
- return false;
+ return !kasan_report(addr, size, write, ret_ip);
}
bool check_memory_region(unsigned long addr, size_t size, bool write,
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index 2d97efd4954f..e200acb2d292 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -110,6 +110,17 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
const char *get_bug_type(struct kasan_access_info *info)
{
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
if (addr_has_shadow(info->access_addr))
return get_shadow_bug_type(info);
return get_wild_bug_type(info);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 3a083274628e..e8f37199d885 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -153,7 +153,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
void *find_first_bad_addr(void *addr, size_t size);
const char *get_bug_type(struct kasan_access_info *info);
-void kasan_report(unsigned long addr, size_t size,
+bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5ef9f24f566b..cf5c17d5e361 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -446,7 +446,7 @@ static void print_shadow_for_address(const void *addr)
}
}
-static bool report_enabled(void)
+bool report_enabled(void)
{
if (current->kasan_depth)
return false;
@@ -478,9 +478,6 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
void *untagged_addr;
unsigned long flags;
- if (likely(!report_enabled()))
- return;
-
disable_trace_on_warning();
tagged_addr = (void *)addr;
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0e987c9ca052..25b7734e7013 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -86,6 +86,9 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
if (unlikely(size == 0))
return true;
+ if (unlikely(addr + size < addr))
+ return !kasan_report(addr, size, write, ret_ip);
+
tag = get_tag((const void *)addr);
/*
@@ -111,15 +114,13 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
untagged_addr = reset_tag((const void *)addr);
if (unlikely(untagged_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, ret_ip);
- return false;
+ return !kasan_report(addr, size, write, ret_ip);
}
shadow_first = kasan_mem_to_shadow(untagged_addr);
shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
if (*shadow != tag) {
- kasan_report(addr, size, write, ret_ip);
- return false;
+ return !kasan_report(addr, size, write, ret_ip);
}
}
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
index 969ae08f59d7..bee43717d6f0 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/tags_report.c
@@ -60,6 +60,17 @@ const char *get_bug_type(struct kasan_access_info *info)
}
#endif
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
return "invalid-access";
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b679908743cb..c659c68728bc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -308,8 +308,6 @@ struct attribute_group khugepaged_attr_group = {
};
#endif /* CONFIG_SYSFS */
-#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
-
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
@@ -423,7 +421,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
}
if (!vma->anon_vma || vma->vm_ops)
return false;
- if (is_vma_temporary_stack(vma))
+ if (vma_is_temporary_stack(vma))
return false;
return !(vm_flags & VM_NO_KHUGEPAGED);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3a4259eeb5a0..e362dc3d2028 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1947,7 +1947,7 @@ void __init kmemleak_init(void)
create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
KMEMLEAK_GREY, GFP_ATOMIC);
/* only register .data..ro_after_init if not within .data */
- if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
+ if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata)
create_object((unsigned long)__start_ro_after_init,
__end_ro_after_init - __start_ro_after_init,
KMEMLEAK_GREY, GFP_ATOMIC);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0f1f6b06b7f3..8de5e3784ee4 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -57,16 +57,6 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
return &nlru->lru;
}
-static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
- struct page *page;
-
- if (!memcg_kmem_enabled())
- return NULL;
- page = virt_to_head_page(ptr);
- return memcg_from_slab_page(page);
-}
-
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
struct mem_cgroup **memcg_ptr)
@@ -77,7 +67,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
if (!nlru->memcg_lrus)
goto out;
- memcg = mem_cgroup_from_kmem(ptr);
+ memcg = mem_cgroup_from_obj(ptr);
if (!memcg)
goto out;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 71070dda9643..2c7d03675903 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -111,26 +111,60 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
return 0;
}
-/* wp_clean_pmd_entry - The pagewalk pmd callback. */
+/*
+ * wp_clean_pmd_entry - The pagewalk pmd callback.
+ *
+ * Dirty-tracking should take place on the PTE level, so
+ * WARN() if encountering a dirty huge pmd.
+ * Furthermore, never split huge pmds, since that currently
+ * causes dirty info loss. The pagefault handler should do
+ * that if needed.
+ */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- /* Dirty-tracking should be handled on the pte level */
pmd_t pmdval = pmd_read_atomic(pmd);
+ if (!pmd_trans_unstable(&pmdval))
+ return 0;
+
+ if (pmd_none(pmdval)) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ /* Huge pmd, present or migrated */
+ walk->action = ACTION_CONTINUE;
if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
return 0;
}
-/* wp_clean_pud_entry - The pagewalk pud callback. */
+/*
+ * wp_clean_pud_entry - The pagewalk pud callback.
+ *
+ * Dirty-tracking should take place on the PTE level, so
+ * WARN() if encountering a dirty huge puds.
+ * Furthermore, never split huge puds, since that currently
+ * causes dirty info loss. The pagefault handler should do
+ * that if needed.
+ */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- /* Dirty-tracking should be handled on the pte level */
pud_t pudval = READ_ONCE(*pud);
+ if (!pud_trans_unstable(&pudval))
+ return 0;
+
+ if (pud_none(pudval)) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ /* Huge pud */
+ walk->action = ACTION_CONTINUE;
if (pud_trans_huge(pudval) || pud_devmap(pudval))
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
diff --git a/mm/memblock.c b/mm/memblock.c
index eba94ee3de0b..4d06bbaded0f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1698,7 +1698,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- phys_addr_t max_addr = PHYS_ADDR_MAX;
+ phys_addr_t max_addr;
if (!limit)
return;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7ddf91c4295f..ca194864d802 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -334,7 +334,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
if (!old)
return 0;
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
if (!new)
return -ENOMEM;
@@ -378,7 +378,7 @@ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
mutex_lock(&memcg_shrinker_map_mutex);
size = memcg_shrinker_map_size;
for_each_node(nid) {
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
if (!map) {
memcg_free_shrinker_maps(memcg);
ret = -ENOMEM;
@@ -656,7 +656,7 @@ retry:
*/
__mem_cgroup_remove_exceeded(mz, mctz);
if (!soft_limit_excess(mz->memcg) ||
- !css_tryget_online(&mz->memcg->css))
+ !css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -759,13 +759,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
- struct page *page = virt_to_head_page(p);
- pg_data_t *pgdat = page_pgdat(page);
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
- memcg = memcg_from_slab_page(page);
+ memcg = mem_cgroup_from_obj(p);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg || memcg == root_mem_cgroup) {
@@ -973,7 +972,8 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
return NULL;
rcu_read_lock();
- if (!memcg || !css_tryget_online(&memcg->css))
+ /* Page should not get uncharged and freed memcg under us. */
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
rcu_read_unlock();
return memcg;
@@ -986,10 +986,13 @@ EXPORT_SYMBOL(get_mem_cgroup_from_page);
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg = root_mem_cgroup;
+ struct mem_cgroup *memcg;
rcu_read_lock();
- if (css_tryget_online(&current->active_memcg->css))
+ /* current->active_memcg must hold a ref. */
+ if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+ memcg = root_mem_cgroup;
+ else
memcg = current->active_memcg;
rcu_read_unlock();
return memcg;
@@ -1518,11 +1521,11 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.max), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)memcg->swap.max), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
@@ -1549,13 +1552,13 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max;
- max = memcg->memory.max;
+ max = READ_ONCE(memcg->memory.max);
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_max;
unsigned long swap_max;
memsw_max = memcg->memsw.max;
- swap_max = memcg->swap.max;
+ swap_max = READ_ONCE(memcg->swap.max);
swap_max = min(swap_max, (unsigned long)total_swap_pages);
max = min(max + swap_max, memsw_max);
}
@@ -1928,6 +1931,14 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
goto out;
/*
+ * If the victim task has been asynchronously moved to a different
+ * memory cgroup, we might end up killing tasks outside oom_domain.
+ * In this case it's better to ignore memory.group.oom.
+ */
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
+ goto out;
+
+ /*
* Traverse the memory cgroup hierarchy from the victim task's
* cgroup up to the OOMing cgroup (or root) to find the
* highest-level memory cgroup with oom.group set.
@@ -2239,7 +2250,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
+ if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2579,7 +2590,7 @@ done_restock:
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > memcg->high) {
+ if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
schedule_work(&memcg->high_work);
@@ -2882,18 +2893,16 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
}
/**
- * __memcg_kmem_charge_memcg: charge a kmem page
- * @page: page to charge
- * @gfp: reclaim mode
- * @order: allocation order
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg)
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages)
{
- unsigned int nr_pages = 1 << order;
struct page_counter *counter;
int ret;
@@ -2920,14 +2929,29 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
}
/**
- * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
@@ -2937,7 +2961,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
@@ -2948,26 +2972,11 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
}
/**
- * __memcg_kmem_uncharge_memcg: uncharge a kmem page
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
- unsigned int nr_pages)
-{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-}
-/**
- * __memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void __memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge_page(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2976,7 +2985,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge_memcg(memcg, nr_pages);
+ __memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -3067,7 +3076,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
* Make sure that the new limit (memsw or memory limit) doesn't
* break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? max >= memcg->memory.max :
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
max <= memcg->memsw.max;
if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
@@ -3814,8 +3823,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.max);
- memsw = min(memsw, mi->memsw.max);
+ memory = min(memory, READ_ONCE(mi->memory.max));
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -4324,7 +4333,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
+ READ_ONCE(memcg->high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4792,7 +4802,8 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#if defined(CONFIG_MEMCG_KMEM) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
.seq_start = memcg_slab_start,
@@ -4861,7 +4872,8 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
}
}
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
+ unsigned int n)
{
refcount_add(n, &memcg->id.ref);
}
@@ -5044,7 +5056,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (!memcg)
return ERR_PTR(error);
- memcg->high = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -5197,7 +5209,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- memcg->high = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
}
@@ -6013,7 +6025,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- memcg->high = high;
+ WRITE_ONCE(memcg->high, high);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6236,6 +6248,117 @@ struct cgroup_subsys memory_cgrp_subsys = {
.early_init = 0,
};
+/*
+ * This function calculates an individual cgroup's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ * the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ * subsequent levels the effective protection is capped to the
+ * parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ * user is allowed to overcommit the declared protection at a given
+ * level. If that is the case, the parent's effective protection is
+ * distributed to the children in proportion to how much protection
+ * they have declared and how much of it they are utilizing.
+ *
+ * This makes distribution proportional, but also work-conserving:
+ * if one cgroup claims much more protection than it uses memory,
+ * the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ * given level, the distribution of the larger parental protection
+ * budget is NOT proportional. A cgroup's protection from a sibling
+ * is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ * without having to declare each individual cgroup's fixed share
+ * of the ancestor's claim to protection, any unutilized -
+ * "floating" - protection from up the tree is distributed in
+ * proportion to each cgroup's *usage*. This makes the protection
+ * neutral wrt sibling cgroups and lets them compete freely over
+ * the shared parental protection budget, but it protects the
+ * subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+ unsigned long parent_usage,
+ unsigned long setting,
+ unsigned long parent_effective,
+ unsigned long siblings_protected)
+{
+ unsigned long protected;
+ unsigned long ep;
+
+ protected = min(usage, setting);
+ /*
+ * If all cgroups at this level combined claim and use more
+ * protection then what the parent affords them, distribute
+ * shares in proportion to utilization.
+ *
+ * We are using actual utilization rather than the statically
+ * claimed protection in order to be work-conserving: claimed
+ * but unused protection is available to siblings that would
+ * otherwise get a smaller chunk than what they claimed.
+ */
+ if (siblings_protected > parent_effective)
+ return protected * parent_effective / siblings_protected;
+
+ /*
+ * Ok, utilized protection of all children is within what the
+ * parent affords them, so we know whatever this child claims
+ * and utilizes is effectively protected.
+ *
+ * If there is unprotected usage beyond this value, reclaim
+ * will apply pressure in proportion to that amount.
+ *
+ * If there is unutilized protection, the cgroup will be fully
+ * shielded from reclaim, but we do return a smaller value for
+ * protection than what the group could enjoy in theory. This
+ * is okay. With the overcommit distribution above, effective
+ * protection is always dependent on how memory is actually
+ * consumed among the siblings anyway.
+ */
+ ep = protected;
+
+ /*
+ * If the children aren't claiming (all of) the protection
+ * afforded to them by the parent, distribute the remainder in
+ * proportion to the (unprotected) memory of each cgroup. That
+ * way, cgroups that aren't explicitly prioritized wrt each
+ * other compete freely over the allowance, but they are
+ * collectively protected from neighboring trees.
+ *
+ * We're using unprotected memory for the weight so that if
+ * some cgroups DO claim explicit protection, we don't protect
+ * the same bytes twice.
+ */
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
+ return ep;
+
+ if (parent_effective > siblings_protected && usage > protected) {
+ unsigned long unclaimed;
+
+ unclaimed = parent_effective - siblings_protected;
+ unclaimed *= usage - protected;
+ unclaimed /= parent_usage - siblings_protected;
+
+ ep += unclaimed;
+ }
+
+ return ep;
+}
+
/**
* mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
@@ -6249,70 +6372,12 @@ struct cgroup_subsys memory_cgrp_subsys = {
* MEMCG_PROT_LOW: cgroup memory is protected as long there is
* an unprotected supply of reclaimable memory from other cgroups.
* MEMCG_PROT_MIN: cgroup memory is protected
- *
- * @root is exclusive; it is never protected when looked at directly
- *
- * To provide a proper hierarchical behavior, effective memory.min/low values
- * are used. Below is the description of how effective memory.low is calculated.
- * Effective memory.min values is calculated in the same way.
- *
- * Effective memory.low is always equal or less than the original memory.low.
- * If there is no memory.low overcommittment (which is always true for
- * top-level memory cgroups), these two values are equal.
- * Otherwise, it's a part of parent's effective memory.low,
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
- * memory.low usages, where memory.low usage is the size of actually
- * protected memory.
- *
- * low_usage
- * elow = min( memory.low, parent->elow * ------------------ ),
- * siblings_low_usage
- *
- * | memory.current, if memory.current < memory.low
- * low_usage = |
- * | 0, otherwise.
- *
- *
- * Such definition of the effective memory.low provides the expected
- * hierarchical behavior: parent's memory.low value is limiting
- * children, unprotected memory is reclaimed first and cgroups,
- * which are not using their guarantee do not affect actual memory
- * distribution.
- *
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
- *
- * A A/memory.low = 2G, A/memory.current = 6G
- * //\\
- * BC DE B/memory.low = 3G B/memory.current = 2G
- * C/memory.low = 1G C/memory.current = 2G
- * D/memory.low = 0 D/memory.current = 2G
- * E/memory.low = 10G E/memory.current = 0
- *
- * and the memory pressure is applied, the following memory distribution
- * is expected (approximately):
- *
- * A/memory.current = 2G
- *
- * B/memory.current = 1.3G
- * C/memory.current = 0.6G
- * D/memory.current = 0
- * E/memory.current = 0
- *
- * These calculations require constant tracking of the actual low usages
- * (see propagate_protected_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_protected()
- * path for each memory cgroup top-down from the reclaim,
- * it's possible to optimize this part, and save calculated elow
- * for next usage. This part is intentionally racy, but it's ok,
- * as memory.low is a best-effort mechanism.
*/
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg)
{
+ unsigned long usage, parent_usage;
struct mem_cgroup *parent;
- unsigned long emin, parent_emin;
- unsigned long elow, parent_elow;
- unsigned long usage;
if (mem_cgroup_disabled())
return MEMCG_PROT_NONE;
@@ -6326,52 +6391,32 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (!usage)
return MEMCG_PROT_NONE;
- emin = memcg->memory.min;
- elow = memcg->memory.low;
-
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
return MEMCG_PROT_NONE;
- if (parent == root)
- goto exit;
-
- parent_emin = READ_ONCE(parent->memory.emin);
- emin = min(emin, parent_emin);
- if (emin && parent_emin) {
- unsigned long min_usage, siblings_min_usage;
-
- min_usage = min(usage, memcg->memory.min);
- siblings_min_usage = atomic_long_read(
- &parent->memory.children_min_usage);
-
- if (min_usage && siblings_min_usage)
- emin = min(emin, parent_emin * min_usage /
- siblings_min_usage);
+ if (parent == root) {
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
+ memcg->memory.elow = memcg->memory.low;
+ goto out;
}
- parent_elow = READ_ONCE(parent->memory.elow);
- elow = min(elow, parent_elow);
- if (elow && parent_elow) {
- unsigned long low_usage, siblings_low_usage;
+ parent_usage = page_counter_read(&parent->memory);
- low_usage = min(usage, memcg->memory.low);
- siblings_low_usage = atomic_long_read(
- &parent->memory.children_low_usage);
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
+ READ_ONCE(memcg->memory.min),
+ READ_ONCE(parent->memory.emin),
+ atomic_long_read(&parent->memory.children_min_usage)));
- if (low_usage && siblings_low_usage)
- elow = min(elow, parent_elow * low_usage /
- siblings_low_usage);
- }
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
+ memcg->memory.low, READ_ONCE(parent->memory.elow),
+ atomic_long_read(&parent->memory.children_low_usage)));
-exit:
- memcg->memory.emin = emin;
- memcg->memory.elow = elow;
-
- if (usage <= emin)
+out:
+ if (usage <= memcg->memory.emin)
return MEMCG_PROT_MIN;
- else if (usage <= elow)
+ else if (usage <= memcg->memory.elow)
return MEMCG_PROT_LOW;
else
return MEMCG_PROT_NONE;
@@ -6759,7 +6804,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
- if (css_tryget_online(&memcg->css))
+ if (css_tryget(&memcg->css))
sk->sk_memcg = memcg;
out:
rcu_read_unlock();
@@ -7080,7 +7125,8 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+ if (page_counter_read(&memcg->swap) * 2 >=
+ READ_ONCE(memcg->swap.max))
return true;
return false;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 41c634f45d45..1c961cd26c0b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -954,7 +954,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success;
+ bool unmap_success = true;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1016,7 +1016,32 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_success = try_to_unmap(hpage, ttu);
+ if (!PageHuge(hpage)) {
+ unmap_success = try_to_unmap(hpage, ttu);
+ } else {
+ /*
+ * For hugetlb pages, try_to_unmap could potentially call
+ * huge_pmd_unshare. Because of this, take semaphore in
+ * write mode here and set TTU_RMAP_LOCKED to indicate we
+ * have taken the lock at this higer level.
+ *
+ * Note that the call to hugetlb_page_mapping_lock_write
+ * is necessary even if mapping is already set. It handles
+ * ugliness of potentially having to drop page lock to obtain
+ * i_mmap_rwsem.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+
+ if (mapping) {
+ unmap_success = try_to_unmap(hpage,
+ ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else {
+ pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n",
+ pfn);
+ unmap_success = false;
+ }
+ }
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index e8bfdf0d9d1d..5c356a57b892 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1939,7 +1939,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target user address to start at
- * @pfn: physical address of kernel memory
+ * @pfn: page frame number of kernel physical memory address
* @size: size of map area
* @prot: page protection flags for this mapping
*
@@ -2009,7 +2009,7 @@ EXPORT_SYMBOL(remap_pfn_range);
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
- * @start: start of area
+ * @start: start of the physical memory to be mapped
* @len: size of area
*
* This is a simplified io_remap_pfn_range() for common driver use. The
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 977c641f78cf..5fb427aed612 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -557,9 +557,10 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
- unsigned long flags = qp->flags;
+ unsigned long flags = (qp->flags & MPOL_MF_VALID);
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -571,16 +572,44 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
+
+ if (flags == MPOL_MF_STRICT) {
+ /*
+ * STRICT alone means only detecting misplaced page and no
+ * need to further check other vma.
+ */
+ ret = -EIO;
+ goto unlock;
+ }
+
+ if (!vma_migratable(walk->vma)) {
+ /*
+ * Must be STRICT with MOVE*, otherwise .test_walk() have
+ * stopped walking current vma.
+ * Detecting misplaced page but allow migrating pages which
+ * have been queued.
+ */
+ ret = 1;
+ goto unlock;
+ }
+
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, qp->pagelist);
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+ if (!isolate_huge_page(page, qp->pagelist) &&
+ (flags & MPOL_MF_STRICT))
+ /*
+ * Failed to isolate page but allow migrating pages
+ * which have been queued.
+ */
+ ret = 1;
+ }
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
- return 0;
+ return ret;
}
#ifdef CONFIG_NUMA_BALANCING
@@ -621,7 +650,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long flags = qp->flags;
/* range check first */
- VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
+ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
if (!qp->first) {
qp->first = vma;
@@ -1714,6 +1743,34 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
#endif /* CONFIG_COMPAT */
+bool vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ return false;
+
+ /*
+ * DAX device mappings require predictable access latency, so avoid
+ * incurring periodic faults.
+ */
+ if (vma_is_dax(vma))
+ return false;
+
+ if (is_vm_hugetlb_page(vma) &&
+ !hugepage_migration_supported(hstate_vma(vma)))
+ return false;
+
+ /*
+ * Migration allocates pages in the highest zone. If we cannot
+ * do so then migration (at least from node to node) is not
+ * possible.
+ */
+ if (vma->vm_file &&
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+ < policy_zone)
+ return false;
+ return true;
+}
+
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -2841,7 +2898,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
switch (mode) {
case MPOL_PREFERRED:
/*
- * Insist on a nodelist of one node only
+ * Insist on a nodelist of one node only, although later
+ * we use first_node(nodes) to grab a single node, so here
+ * nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
@@ -2849,6 +2908,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
rest++;
if (*rest)
goto out;
+ if (nodes_empty(nodes))
+ goto out;
}
break;
case MPOL_INTERLEAVE:
diff --git a/mm/migrate.c b/mm/migrate.c
index 7605d2c23433..7ded07081be9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1282,6 +1282,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
/*
* Migratability of hugepages depends on architectures and their size.
@@ -1329,18 +1330,36 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
+ /*
+ * try_to_unmap could potentially call huge_pmd_unshare.
+ * Because of this, take semaphore in write mode here and
+ * set TTU_RMAP_LOCKED to let lower levels know we have
+ * taken the lock.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (unlikely(!mapping))
+ goto unlock_put_anon;
+
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_RMAP_LOCKED);
page_was_mapped = 1;
+ /*
+ * Leave mapping locked until after subsequent call to
+ * remove_migration_ptes()
+ */
}
if (!page_mapped(hpage))
rc = move_to_new_page(new_hpage, hpage, mode);
- if (page_was_mapped)
+ if (page_was_mapped) {
remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, true);
+ i_mmap_unlock_write(mapping);
+ }
+unlock_put_anon:
unlock_page(new_hpage);
put_anon:
diff --git a/mm/mmap.c b/mm/mmap.c
index d681a20eb4ea..94ae18398c59 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -53,6 +53,9 @@
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap.h>
+
#include "internal.h"
#ifndef arch_mmap_check
@@ -1848,7 +1851,7 @@ unacct_error:
return error;
}
-unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
* We implement the search by looking for an rbtree node that
@@ -1951,7 +1954,7 @@ found:
return gap_start;
}
-unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -2050,6 +2053,27 @@ found_highest:
return gap_end;
}
+/*
+ * Search for an unmapped address range.
+ *
+ * We are looking for a range that:
+ * - does not intersect with any VMA;
+ * - is contained within the [low_limit, high_limit) interval;
+ * - is at least the desired size.
+ * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
+ */
+unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long addr;
+
+ if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
+ addr = unmapped_area_topdown(info);
+ else
+ addr = unmapped_area(info);
+
+ trace_vm_unmapped_area(addr, info);
+ return addr;
+}
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr) (TASK_SIZE)
diff --git a/mm/mremap.c b/mm/mremap.c
index d28f08a36b96..a7e282ead438 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -133,7 +133,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* such races:
*
* - During exec() shift_arg_pages(), we use a specially tagged vma
- * which rmap call sites look for using is_vma_temporary_stack().
+ * which rmap call sites look for using vma_is_temporary_stack().
*
* - During mremap(), new_vma is often known to be placed after vma
* in rmap traversal order. This ensures rmap will always observe
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,11 +408,32 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);
+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT) {
+ /* Always put back VM_ACCOUNT since we won't unmap */
+ vma->vm_flags |= VM_ACCOUNT;
+
+ vm_acct_memory(vma_pages(new_vma));
+ }
+
+ /* We always clear VM_LOCKED[ONFAULT] on the old vma */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+ /* Because we won't unmap we don't need to touch locked_vm */
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ *locked = true;
+ }
+out:
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -422,16 +443,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
return new_addr;
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags,
+ unsigned long *p)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = find_vma(mm, addr);
@@ -453,6 +470,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}
+ if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+ vma->vm_flags & VM_SHARED))
+ return ERR_PTR(-EINVAL);
+
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
@@ -497,7 +518,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -505,7 +526,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
- unsigned long map_flags;
+ unsigned long map_flags = 0;
if (offset_in_page(new_addr))
goto out;
@@ -534,9 +555,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;
- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- goto out;
+ if (flags & MREMAP_FIXED) {
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+ if (ret)
+ goto out;
+ }
if (old_len >= new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
@@ -545,13 +568,22 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
}
- map_flags = MAP_FIXED;
+ /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+ if (flags & MREMAP_DONTUNMAP &&
+ !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;
@@ -561,10 +593,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (IS_ERR_VALUE(ret))
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ /* We got a new mapping */
+ if (!(flags & MREMAP_FIXED))
+ new_addr = ret;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
+
if (!(offset_in_page(ret)))
goto out;
+
out1:
vm_unacct_memory(charged);
@@ -618,12 +656,21 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
*/
addr = untagged_addr(addr);
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;
+ /*
+ * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+ * in the process.
+ */
+ if (flags & MREMAP_DONTUNMAP &&
+ (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+ return ret;
+
+
if (offset_in_page(addr))
return ret;
@@ -641,9 +688,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;
- if (flags & MREMAP_FIXED) {
+ if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}
@@ -671,7 +719,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -721,7 +769,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2caf780a42e7..7326b54ab728 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2182,12 +2182,12 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+ tag_pages_for_writeback(mapping, index, end);
tag = PAGECACHE_TAG_TOWRITE;
- else
+ } else {
tag = PAGECACHE_TAG_DIRTY;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
+ }
done_index = index;
while (!done && (index <= end)) {
int i;
@@ -2655,7 +2655,7 @@ int clear_page_dirty_for_io(struct page *page)
struct address_space *mapping = page_mapping(page);
int ret = 0;
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
@@ -2764,7 +2764,7 @@ int test_clear_page_writeback(struct page *page)
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- int ret;
+ int ret, access_ret;
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
@@ -2807,6 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
+ access_ret = arch_make_page_accessible(page);
+ /*
+ * If writeback has been triggered on a page that cannot be made
+ * accessible, it is too late to recover here.
+ */
+ VM_BUG_ON_PAGE(access_ret != 0, page);
+
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..e5f76da8cd4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -95,7 +95,6 @@ DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
-int _node_numa_mem_[MAX_NUMNODES];
#endif
/* work_structs for global per-cpu drains */
@@ -689,6 +688,8 @@ void prep_compound_page(struct page *page, unsigned int order)
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -791,32 +792,25 @@ static inline void set_page_order(struct page *page, unsigned int order)
*
* For recording page's order, we use page_private(page).
*/
-static inline int page_is_buddy(struct page *page, struct page *buddy,
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
- if (page_is_guard(buddy) && page_order(buddy) == order) {
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
-
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
+ return false;
- return 1;
- }
+ if (page_order(buddy) != order)
+ return false;
- if (PageBuddy(buddy) && page_order(buddy) == order) {
- /*
- * zone check is done late to avoid uselessly
- * calculating zone/node ids for pages that could
- * never merge.
- */
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
+ /*
+ * zone check is done late to avoid uselessly calculating
+ * zone/node ids for pages that could never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return false;
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
- return 1;
- }
- return 0;
+ return true;
}
#ifdef CONFIG_COMPACTION
@@ -1152,7 +1146,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- __memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -3459,8 +3453,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
- if (alloc_harder &&
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
return true;
}
return false;
@@ -3535,10 +3528,13 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
static inline unsigned int
alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
- unsigned int alloc_flags = 0;
+ unsigned int alloc_flags;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
- alloc_flags |= ALLOC_KSWAPD;
+ /*
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+ * to save a branch.
+ */
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
#ifdef CONFIG_ZONE_DMA32
if (!zone)
@@ -4174,8 +4170,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
+ /*
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+ * to save two branches.
+ */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
/*
* The caller may dip into page reserves a bit more if the caller
@@ -4183,7 +4184,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
+ alloc_flags |= (__force int)
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
if (gfp_mask & __GFP_ATOMIC) {
/*
@@ -4200,9 +4202,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
- alloc_flags |= ALLOC_KSWAPD;
-
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -4745,14 +4744,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
- if (unlikely(ac.nodemask != nodemask))
- ac.nodemask = nodemask;
+ ac.nodemask = nodemask;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -7867,8 +7865,8 @@ int __meminit init_per_zone_wmark_min(void)
min_free_kbytes = new_min_free_kbytes;
if (min_free_kbytes < 128)
min_free_kbytes = 128;
- if (min_free_kbytes > 65536)
- min_free_kbytes = 65536;
+ if (min_free_kbytes > 262144)
+ min_free_kbytes = 262144;
} else {
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
@@ -8253,15 +8251,20 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
/*
* Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
* We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
- if (PageHuge(page)) {
+ if (PageHuge(page) || PageTransCompound(page)) {
struct page *head = compound_head(page);
unsigned int skip_pages;
- if (!hugepage_migration_supported(page_hstate(head)))
+ if (PageHuge(page)) {
+ if (!hugepage_migration_supported(page_hstate(head)))
+ return page;
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
return page;
+ }
skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
@@ -8402,6 +8405,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
.gfp_mask = current_gfp_context(gfp_mask),
+ .alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f6..c56db2d5e159 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,29 +17,24 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
+ unsigned long low, min;
long delta;
if (!c->parent)
return;
- if (c->min || atomic_long_read(&c->min_usage)) {
- if (usage <= c->min)
- protected = usage;
- else
- protected = 0;
-
+ min = READ_ONCE(c->min);
+ if (min || atomic_long_read(&c->min_usage)) {
+ protected = min(usage, min);
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
atomic_long_add(delta, &c->parent->children_min_usage);
}
- if (c->low || atomic_long_read(&c->low_usage)) {
- if (usage <= c->low)
- protected = usage;
- else
- protected = 0;
-
+ low = READ_ONCE(c->low);
+ if (low || atomic_long_read(&c->low_usage)) {
+ protected = min(usage, low);
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -213,7 +208,7 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;
- counter->min = nr_pages;
+ WRITE_ONCE(counter->min, nr_pages);
for (c = counter; c; c = c->parent)
propagate_protected_usage(c, atomic_long_read(&c->usage));
@@ -230,7 +225,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;
- counter->low = nr_pages;
+ WRITE_ONCE(counter->low, nr_pages);
for (c = counter; c; c = c->parent)
propagate_protected_usage(c, atomic_long_read(&c->usage));
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4ade843ff588..08ded037f89f 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -304,7 +304,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
}
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
- if (!pfn_present(pfn))
+ if (!pfn_in_present_section(pfn))
continue;
fail = init_section_page_ext(pfn, nid);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index b3e381919835..2df75a119c83 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -22,9 +22,10 @@
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
* mm->mmap_sem
- * page->flags PG_locked (lock_page)
+ * page->flags PG_locked (lock_page) * (see huegtlbfs below)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -43,6 +44,11 @@
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
+ *
+ * * hugetlbfs PageHuge() pages take locks in this order:
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
@@ -1178,6 +1184,9 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
__inc_node_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
@@ -1406,6 +1415,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
+ *
+ * If called for a huge page, caller must hold i_mmap_rwsem
+ * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
@@ -1453,6 +1465,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address = pvmw.address;
if (PageHuge(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
@@ -1696,23 +1714,9 @@ discard:
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
- return is_vma_temporary_stack(vma);
+ return vma_is_temporary_stack(vma);
}
static int page_mapcount_is_zero(struct page *page)
@@ -1974,6 +1978,9 @@ void hugepage_add_new_anon_rmap(struct page *page,
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shuffle.c b/mm/shuffle.c
index b3fe97fd6654..c716059cbd3c 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -72,7 +72,7 @@ static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
return NULL;
/* ...is the pfn in a present section or a hole? */
- if (!pfn_present(pfn))
+ if (!pfn_in_present_section(pfn))
return NULL;
/* ...is the page free and currently on a free_area list? */
diff --git a/mm/slab.h b/mm/slab.h
index 7e94700aa78c..207c83ef6e06 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -348,6 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ unsigned int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
int ret;
@@ -360,21 +361,21 @@ static __always_inline int memcg_charge_slab(struct page *page,
if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- (1 << order));
- percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+ nr_pages);
+ percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
return 0;
}
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = memcg_kmem_charge(memcg, gfp, nr_pages);
if (ret)
goto out;
lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
/* transer try_charge() page references to kmem_cache */
- percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
- css_put_many(&memcg->css, 1 << order);
+ percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
+ css_put_many(&memcg->css, nr_pages);
out:
css_put(&memcg->css);
return ret;
@@ -387,6 +388,7 @@ out:
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
+ unsigned int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
@@ -394,15 +396,15 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
memcg = READ_ONCE(s->memcg_params.memcg);
if (likely(!mem_cgroup_is_root(memcg))) {
lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
- memcg_kmem_uncharge_memcg(page, order, memcg);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
+ memcg_kmem_uncharge(memcg, nr_pages);
} else {
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(1 << order));
+ -nr_pages);
}
rcu_read_unlock();
- percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
+ percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
}
extern void slab_init_memcg_params(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1907cb2903c7..5282f881d2f5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1521,7 +1521,7 @@ void dump_unreclaimable_slab(void)
mutex_unlock(&slab_mutex);
}
-#if defined(CONFIG_MEMCG)
+#if defined(CONFIG_MEMCG_KMEM)
void *memcg_slab_start(struct seq_file *m, loff_t *pos)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
diff --git a/mm/slub.c b/mm/slub.c
index 6589b41d5a60..3098e0cf2899 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -259,7 +259,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
* freepointer to be restored incorrectly.
*/
return (void *)((unsigned long)ptr ^ s->random ^
- (unsigned long)kasan_reset_tag((void *)ptr_addr));
+ swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
return ptr;
#endif
@@ -2205,11 +2205,11 @@ static void unfreeze_partials(struct kmem_cache *s,
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct page *page, *discard_page = NULL;
- while ((page = c->partial)) {
+ while ((page = slub_percpu_partial(c))) {
struct page new;
struct page old;
- c->partial = page->next;
+ slub_set_percpu_partial(c, page);
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
@@ -2282,7 +2282,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
if (oldpage) {
pobjects = oldpage->pobjects;
pages = oldpage->pages;
- if (drain && pobjects > s->cpu_partial) {
+ if (drain && pobjects > slub_cpu_partial(s)) {
unsigned long flags;
/*
* partial array is full. Move the existing
@@ -2307,7 +2307,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
- if (unlikely(!s->cpu_partial)) {
+ if (unlikely(!slub_cpu_partial(s))) {
unsigned long flags;
local_irq_save(flags);
@@ -3512,15 +3512,15 @@ static void set_cpu_partial(struct kmem_cache *s)
* 50% to keep some capacity around for frees.
*/
if (!kmem_cache_has_cpu_partial(s))
- s->cpu_partial = 0;
+ slub_set_cpu_partial(s, 0);
else if (s->size >= PAGE_SIZE)
- s->cpu_partial = 2;
+ slub_set_cpu_partial(s, 2);
else if (s->size >= 1024)
- s->cpu_partial = 6;
+ slub_set_cpu_partial(s, 6);
else if (s->size >= 256)
- s->cpu_partial = 13;
+ slub_set_cpu_partial(s, 13);
else
- s->cpu_partial = 30;
+ slub_set_cpu_partial(s, 30);
#endif
}
@@ -3581,6 +3581,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
+ } else if (size > sizeof(void *)) {
+ /*
+ * Store freelist pointer near middle of object to keep
+ * it away from the edges of the object to avoid small
+ * sized over/underflows from neighboring allocations.
+ */
+ s->offset = ALIGN(size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
diff --git a/mm/sparse.c b/mm/sparse.c
index 65599e8bd636..f1af4d4ee80b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -664,35 +664,14 @@ static void free_map_bootmem(struct page *memmap)
struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
-
- page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
- if (page)
- goto got_map_page;
-
- ret = vmalloc(memmap_size);
- if (ret)
- goto got_map_ptr;
-
- return NULL;
-got_map_page:
- ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-
- return ret;
+ return kvmalloc_node(array_size(sizeof(struct page),
+ PAGES_PER_SECTION), GFP_KERNEL, nid);
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- struct page *memmap = pfn_to_page(pfn);
-
- if (is_vmalloc_addr(memmap))
- vfree(memmap);
- else
- free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * PAGES_PER_SECTION));
+ kvfree(pfn_to_page(pfn));
}
static void free_map_bootmem(struct page *memmap)
@@ -894,7 +873,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
/* Align memmap to section boundary in the subsection case */
if (section_nr_to_pfn(section_nr) != start_pfn)
- memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+ memmap = pfn_to_page(section_nr_to_pfn(section_nr));
sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
return 0;
diff --git a/mm/swap.c b/mm/swap.c
index cf39d24ada2a..a4af8c999963 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -931,7 +931,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
/*
* Page becomes evictable in two ways:
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
@@ -958,7 +957,8 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
* looking at the same page) and the evictable page will be stranded
* in an unevictable LRU.
*/
- smp_mb();
+ SetPageLRU(page);
+ smp_mb__after_atomic();
if (page_evictable(page)) {
lru = page_lru(page);
@@ -986,7 +986,6 @@ void __pagevec_lru_add(struct pagevec *pvec)
{
pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
-EXPORT_SYMBOL(__pagevec_lru_add);
/**
* pagevec_lookup_entries - gang pagecache lookup
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 63a7b4563a57..0975adc72253 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -309,7 +309,7 @@ direct_free:
swp_entry_t get_swap_page(struct page *page)
{
- swp_entry_t entry, *pentry;
+ swp_entry_t entry;
struct swap_slots_cache *cache;
entry.val = 0;
@@ -336,13 +336,11 @@ swp_entry_t get_swap_page(struct page *page)
if (cache->slots) {
repeat:
if (cache->nr) {
- pentry = &cache->slots[cache->cur++];
- entry = *pentry;
- pentry->val = 0;
+ entry = cache->slots[cache->cur];
+ cache->slots[cache->cur++].val = 0;
cache->nr--;
- } else {
- if (refill_swap_slots_cache(cache))
- goto repeat;
+ } else if (refill_swap_slots_cache(cache)) {
+ goto repeat;
}
}
mutex_unlock(&cache->alloc_lock);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8e7ce9a9bc5e..ebed37bbf7a3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = compound_nr(page);
+ unsigned long i, nr = hpage_nr_pages(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index be33e6176cd9..273a923c275c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2132,7 +2132,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
swp_entry_t entry;
unsigned int i;
- if (!si->inuse_pages)
+ if (!READ_ONCE(si->inuse_pages))
return 0;
if (!frontswap)
@@ -2148,7 +2148,7 @@ retry:
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (si->inuse_pages &&
+ while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2177,7 +2177,7 @@ retry:
mmput(prev_mm);
i = 0;
- while (si->inuse_pages &&
+ while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i, frontswap)) != 0) {
@@ -2219,7 +2219,7 @@ retry:
* been preempted after get_swap_page(), temporarily hiding that swap.
* It's easy and robust (though cpu-intensive) just to keep retrying.
*/
- if (si->inuse_pages) {
+ if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
goto retry;
retval = -EINTR;
@@ -3475,7 +3475,7 @@ int swap_duplicate(swp_entry_t entry)
*
* Called when allocating swap cache for existing swap entry,
* This can return error codes. Returns 0 at success.
- * -EBUSY means there is a swap cache.
+ * -EEXIST means there is a swap cache.
* Note: return code is different from swap_duplicate().
*/
int swapcache_prepare(swp_entry_t entry)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 1b0d7abad1d4..bd96855f3961 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -276,10 +276,14 @@ retry:
BUG_ON(dst_addr >= dst_start + len);
/*
- * Serialize via hugetlb_fault_mutex
+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+ * i_mmap_rwsem ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
*/
- idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
+ i_mmap_lock_read(mapping);
+ idx = linear_page_index(dst_vma, dst_addr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -287,6 +291,7 @@ retry:
dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -294,6 +299,7 @@ retry:
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -301,6 +307,7 @@ retry:
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 4bac22fe1aa2..d69019fc3789 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -280,7 +280,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
enum vmpressure_levels level;
/* For now, no users for root-level efficiency */
- if (!memcg || memcg == root_mem_cgroup)
+ if (!memcg || mem_cgroup_is_root(memcg))
return;
spin_lock(&vmpr->sr_lock);
@@ -371,10 +371,8 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
int ret = 0;
spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
- if (!spec) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!spec)
+ return -ENOMEM;
/* Find required level */
token = strsep(&spec, ",");
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 876370565455..2e8e690d2813 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1084,9 +1084,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
- int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM;
- bool dirty, writeback;
+ bool dirty, writeback, may_enter_fs;
unsigned int nr_pages;
cond_resched();
@@ -1267,7 +1266,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto activate_locked_split;
}
- may_enter_fs = 1;
+ may_enter_fs = true;
/* Adding to swap updated mapping */
mapping = page_mapping(page);
@@ -2096,7 +2095,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long reclaim_pages(struct list_head *page_list)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
unsigned long nr_reclaimed = 0;
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
@@ -2111,7 +2110,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
while (!list_empty(page_list)) {
page = lru_to_page(page_list);
- if (nid == -1) {
+ if (nid == NUMA_NO_NODE) {
nid = page_to_nid(page);
INIT_LIST_HEAD(&node_page_list);
}
@@ -2132,7 +2131,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
putback_lru_page(page);
}
- nid = -1;
+ nid = NUMA_NO_NODE;
}
if (!list_empty(&node_page_list)) {
@@ -2427,10 +2426,8 @@ out:
case SCAN_FILE:
case SCAN_ANON:
/* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file) {
- lruvec_size = 0;
+ if ((scan_balance == SCAN_FILE) != file)
scan = 0;
- }
break;
default:
/* Look ma, no brain */
@@ -3096,7 +3093,6 @@ retry:
if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
sc->force_deactivate = 0;
- sc->skipped_deactivate = 0;
sc->memcg_low_reclaim = 1;
sc->memcg_low_skipped = 0;
goto retry;
@@ -3136,8 +3132,9 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
- (enum zone_type)ZONE_NORMAL);
+ if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL);
+
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3769,9 +3766,9 @@ out:
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
enum zone_type prev_classzone_idx)
{
- if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return prev_classzone_idx;
- return pgdat->kswapd_classzone_idx;
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+
+ return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3815,8 +3812,11 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
- pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx,
+ kswapd_classzone_idx(pgdat, classzone_idx));
+
+ if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
+ WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
}
finish_wait(&pgdat->kswapd_wait, &wait);
@@ -3893,12 +3893,12 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
- alloc_order = reclaim_order = pgdat->kswapd_order;
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
kswapd_try_sleep:
@@ -3906,10 +3906,10 @@ kswapd_try_sleep:
classzone_idx);
/* Read the new order and classzone_idx */
- alloc_order = reclaim_order = pgdat->kswapd_order;
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
- pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3953,20 +3953,23 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
enum zone_type classzone_idx)
{
pg_data_t *pgdat;
+ enum zone_type curr_idx;
if (!managed_zone(zone))
return;
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
+
pgdat = zone->zone_pgdat;
+ curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+
+ if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx)
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx);
+
+ if (READ_ONCE(pgdat->kswapd_order) < order)
+ WRITE_ONCE(pgdat->kswapd_order, order);
- if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- pgdat->kswapd_classzone_idx = classzone_idx;
- else
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
- classzone_idx);
- pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
@@ -4030,27 +4033,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
}
#endif /* CONFIG_HIBERNATION */
-/* It's optimal to keep kswapds on the same CPUs as their memory, but
- not required for correctness. So if the last cpu in a node goes
- away, we get changed to run anywhere: as the first one comes back,
- restore their cpu bindings. */
-static int kswapd_cpu_online(unsigned int cpu)
-{
- int nid;
-
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
-
- mask = cpumask_of_node(pgdat->node_id);
-
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kswapd, mask);
- }
- return 0;
-}
-
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
@@ -4090,15 +4072,11 @@ void kswapd_stop(int nid)
static int __init kswapd_init(void)
{
- int nid, ret;
+ int nid;
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "mm/vmscan:online", kswapd_cpu_online,
- NULL);
- WARN_ON(ret < 0);
return 0;
}
@@ -4277,29 +4255,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
}
#endif
-/*
- * page_evictable - test whether a page is evictable
- * @page: the page to test
- *
- * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
- *
- * Reasons page might not be evictable:
- * (1) page's mapping marked unevictable
- * (2) page is part of an mlocked VMA
- *
- */
-int page_evictable(struct page *page)
-{
- int ret;
-
- /* Prevent address_space of inode and swap cache from being freed */
- rcu_read_lock();
- ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
- rcu_read_unlock();
- return ret;
-}
-
/**
* check_move_unevictable_pages - check pages for evictability and move to
* appropriate zone lru list
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 78d53378db99..c9c0d71f917f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1168,6 +1168,8 @@ const char * const vmstat_text[] = {
"nr_dirtied",
"nr_written",
"nr_kernel_misc_reclaimable",
+ "nr_foll_pin_acquired",
+ "nr_foll_pin_released",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",