summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c1400
1 files changed, 992 insertions, 408 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bb0b7c128015..0b7656e804d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
/*
* Generic hugetlb support.
- * (C) William Irwin, April 2004
+ * (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
@@ -21,20 +21,22 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/page-isolation.h>
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -45,27 +47,103 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
+ * free_huge_pages, and surplus_huge_pages.
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
+
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool the subpool remain */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(file_inode(vma->vm_file));
+}
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
*
* The region data structures are protected by a combination of the mmap_sem
- * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * and the hugetlb_instantiation_mutex. To access or modify a region the caller
* must either hold the mmap_sem for write, or the mmap_sem for read and
- * the hugetlb_instantiation mutex:
+ * the hugetlb_instantiation_mutex:
*
- * down_write(&mm->mmap_sem);
+ * down_write(&mm->mmap_sem);
* or
- * down_read(&mm->mmap_sem);
- * mutex_lock(&hugetlb_instantiation_mutex);
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -146,7 +224,7 @@ static long region_chg(struct list_head *head, long f, long t)
if (rg->from > t)
return chg;
- /* We overlap with this area, if it extends futher than
+ /* We overlap with this area, if it extends further than
* us then we must extend ourselves. Account for its
* existing reservation. */
if (rg->to > t) {
@@ -195,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
- int seg_from;
- int seg_to;
+ long seg_from;
+ long seg_to;
if (rg->to <= f)
continue;
@@ -242,7 +320,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
hstate = hstate_vma(vma);
- return 1UL << (hstate->order + PAGE_SHIFT);
+ return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
@@ -357,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Decrement the reserved pages in the hugepage pool by one */
-static void decrement_hugepage_resv_vma(struct hstate *h,
- struct vm_area_struct *vma)
-{
- if (vma->vm_flags & VM_NORESERVE)
- return;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- /* Shared mappings always use reserves */
- h->resv_huge_pages--;
- } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- h->resv_huge_pages--;
- }
-}
-
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
@@ -385,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
{
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+ * This address is already reserved by other process(chg == 0),
+ * so, we should decrement reserved count. Without decrementing,
+ * reserve count remains after releasing inode, because this
+ * allocated page will go into page cache and is regarded as
+ * coming from reserved pool in releasing step. Currently, we
+ * don't have any other solution to deal with this situation
+ * properly, so add work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return 1;
+ else
+ return 0;
+ }
+
+ /* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE)
return 1;
+
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
+
return 0;
}
@@ -431,7 +513,7 @@ void copy_huge_page(struct page *dst, struct page *src)
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
}
@@ -440,19 +522,35 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (list_empty(&h->hugepage_freelists[nid]))
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+ if (!is_migrate_isolate_page(page))
+ break;
+ /*
+ * if 'non-isolated free hugepage' not found on the list,
+ * the allocation fails.
+ */
+ if (&h->hugepage_freelists[nid] == &page->lru)
return NULL;
- page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
- list_del(&page->lru);
+ list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
}
+/* Movability of hugepages depends on migration support. */
+static inline gfp_t htlb_alloc_mask(struct hstate *h)
+{
+ if (hugepages_treat_as_movable || hugepage_migration_support(h))
+ return GFP_HIGHUSER_MOVABLE;
+ else
+ return GFP_HIGHUSER;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve)
+ unsigned long address, int avoid_reserve,
+ long chg)
{
struct page *page = NULL;
struct mempolicy *mpol;
@@ -460,38 +558,50 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
- get_mems_allowed();
- zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma) &&
+ if (!vma_has_reserves(vma, chg) &&
h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- goto err;;
+ goto err;
+
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask(h), &mpol, &nodemask);
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
+ if (avoid_reserve)
+ break;
+ if (!vma_has_reserves(vma, chg))
+ break;
+
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
break;
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -503,10 +613,12 @@ static void update_and_free_page(struct hstate *h, struct page *page)
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
@@ -532,32 +644,43 @@ static void free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct address_space *mapping;
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
+ bool restore_reserve;
- mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- INIT_LIST_HEAD(&page->lru);
+ restore_reserve = PagePrivate(page);
+ ClearPagePrivate(page);
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ if (restore_reserve)
+ h->resv_huge_pages++;
+
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
} else {
+ arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- if (mapping)
- hugetlb_put_quota(mapping, 1);
+ hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
+ INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -573,12 +696,32 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__SetPageHead(page);
+ __ClearPageReserved(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
+ /*
+ * For gigantic hugepages allocated through bootmem at
+ * boot, it's safer to be consistent with the not-gigantic
+ * hugepages and clear the PG_reserved bit from all tail pages
+ * too. Otherwse drivers using get_user_pages() to access tail
+ * pages may get the reference counting wrong if they see
+ * PG_reserved set on a tail page (despite the head page not
+ * having PG_reserved set). Enforcing this consistency between
+ * head and tail pages allows drivers to optimize away a check
+ * on the head page when they need know if put_page() is needed
+ * after get_user_pages().
+ */
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
int PageHuge(struct page *page)
{
compound_page_dtor *dtor;
@@ -591,9 +734,25 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
-
EXPORT_SYMBOL_GPL(PageHuge);
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -602,7 +761,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
return NULL;
page = alloc_pages_exact_node(nid,
- htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
@@ -659,33 +818,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
return nid;
}
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
-{
- struct page *page;
- int start_nid;
- int next_nid;
- int ret = 0;
-
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- page = alloc_fresh_huge_page_node(h, next_nid);
- if (page) {
- ret = 1;
- break;
- }
- next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- } while (next_nid != start_nid);
-
- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-
- return ret;
-}
-
/*
* helper for free_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
@@ -704,6 +836,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
return nid;
}
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ struct page *page;
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_huge_page_node(h, node);
+ if (page) {
+ ret = 1;
+ break;
+ }
+ }
+
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return ret;
+}
+
/*
* Free huge page from pool from next node to free.
* Attempt to keep persistent huge pages more or less
@@ -713,40 +879,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
bool acct_surplus)
{
- int start_nid;
- int next_nid;
+ int nr_nodes, node;
int ret = 0;
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
- if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
- !list_empty(&h->hugepage_freelists[next_nid])) {
+ if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
+ !list_empty(&h->hugepage_freelists[node])) {
struct page *page =
- list_entry(h->hugepage_freelists[next_nid].next,
+ list_entry(h->hugepage_freelists[node].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
- h->free_huge_pages_node[next_nid]--;
+ h->free_huge_pages_node[node]--;
if (acct_surplus) {
h->surplus_huge_pages--;
- h->surplus_huge_pages_node[next_nid]--;
+ h->surplus_huge_pages_node[node]--;
}
update_and_free_page(h, page);
ret = 1;
break;
}
- next_nid = hstate_next_node_to_free(h, nodes_allowed);
- } while (next_nid != start_nid);
+ }
return ret;
}
+/*
+ * Dissolve a given free hugepage into free buddy pages. This function does
+ * nothing for in-use (including surplus) hugepages.
+ */
+static void dissolve_free_huge_page(struct page *page)
+{
+ spin_lock(&hugetlb_lock);
+ if (PageHuge(page) && !page_count(page)) {
+ struct hstate *h = page_hstate(page);
+ int nid = page_to_nid(page);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ update_and_free_page(h, page);
+ }
+ spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
+ * make specified memory blocks removable from the system.
+ * Note that start_pfn should aligned with (minimum) hugepage size.
+ */
+void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned int order = 8 * sizeof(void *);
+ unsigned long pfn;
+ struct hstate *h;
+
+ /* Set scan step to minimum hugepage size */
+ for_each_hstate(h)
+ if (order > huge_page_order(h))
+ order = huge_page_order(h);
+ VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+ dissolve_free_huge_page(pfn_to_page(pfn));
+}
+
static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
@@ -789,23 +988,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (nid == NUMA_NO_NODE)
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
else
page = alloc_pages_exact_node(nid,
- htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
- return NULL;
+ page = NULL;
}
spin_lock(&hugetlb_lock);
if (page) {
+ INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
*/
@@ -829,10 +1030,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
- struct page *page;
+ struct page *page = NULL;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_node(h, nid);
+ if (h->free_huge_pages - h->resv_huge_pages > 0)
+ page = dequeue_huge_page_node(h, nid);
spin_unlock(&hugetlb_lock);
if (!page)
@@ -842,7 +1044,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
}
/*
- * Increase the hugetlb pool such that it can accomodate a reservation
+ * Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
@@ -851,6 +1053,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
struct page *page, *tmp;
int ret, i;
int needed, allocated;
+ bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
@@ -866,17 +1069,13 @@ retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page)
- /*
- * We were not able to allocate enough pages to
- * satisfy the entire reservation so we free what
- * we've allocated so far.
- */
- goto free;
-
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
list_add(&page->lru, &surplus_list);
}
- allocated += needed;
+ allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -885,12 +1084,19 @@ retry:
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
- if (needed > 0)
- goto retry;
-
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
/*
* The surplus_list now contains _at_least_ the number of extra pages
- * needed to accomodate the reservation. Add the appropriate number
+ * needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
* allocator. Commit the entire reservation here to prevent another
* process from stealing the pages as they are added to the pool but
@@ -900,12 +1106,10 @@ retry:
h->resv_huge_pages += delta;
ret = 0;
- spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
@@ -914,15 +1118,12 @@ retry:
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
+free:
+ spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
-free:
- if (!list_empty(&surplus_list)) {
- list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- list_del(&page->lru);
- put_page(page);
- }
- }
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru)
+ put_page(page);
spin_lock(&hugetlb_lock);
return ret;
@@ -957,7 +1158,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
}
}
@@ -965,11 +1166,12 @@ static void return_unused_surplus_pages(struct hstate *h,
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
@@ -988,9 +1190,9 @@ static long vma_needs_reservation(struct hstate *h,
} else {
long err;
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
- err = region_chg(&reservations->regions, idx, idx + 1);
+ err = region_chg(&resv->regions, idx, idx + 1);
if (err < 0)
return err;
return 0;
@@ -1008,66 +1210,94 @@ static void vma_commit_reservation(struct hstate *h,
} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
/* Mark this page used in the map. */
- region_add(&reservations->regions, idx, idx + 1);
+ region_add(&resv->regions, idx, idx + 1);
}
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
+ struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no reserves and
- * will not have accounted against quota. Check that the quota can be
- * made before satisfying the allocation
- * MAP_NORESERVE mappings may also need pages and quota allocated
- * if no reserve mapping overlaps.
+ * Processes that did not create the mapping will have no
+ * reserves and will not have accounted against subpool
+ * limit. Check that the subpool limit can be made before
+ * satisfying the allocation MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated allocated if no reserve
+ * mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
- if (chg)
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ return ERR_PTR(-ENOMEM);
+ if (chg || avoid_reserve)
+ if (hugepage_subpool_get_pages(spool, 1))
return ERR_PTR(-ENOSPC);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- spin_unlock(&hugetlb_lock);
-
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
if (!page) {
+ spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
- hugetlb_put_quota(inode->i_mapping, chg);
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ return ERR_PTR(-ENOSPC);
}
+ spin_lock(&hugetlb_lock);
+ list_move(&page->lru, &h->hugepage_activelist);
+ /* Fall through */
}
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ spin_unlock(&hugetlb_lock);
- set_page_private(page, (unsigned long) mapping);
+ set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
+ return page;
+}
+/*
+ * alloc_huge_page()'s wrapper which simply returns the page if allocation
+ * succeeds, otherwise NULL. This function is called from new_vma_page(),
+ * where no ERR_VALUE is expected to be returned.
+ */
+struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+{
+ struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
+ if (IS_ERR(page))
+ page = NULL;
return page;
}
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+ int nr_nodes, node;
- while (nr_nodes) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(hstate_next_node_to_alloc(h,
- &node_states[N_HIGH_MEMORY])),
+ addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1079,7 +1309,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
- nr_nodes--;
}
return 0;
@@ -1105,12 +1334,28 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
- __ClearPageReserved(page);
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
+ WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+ * fix confusing memory reports from free(1) and another
+ * side-effects, like CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ adjust_managed_page_count(page, 1 << h->order);
}
}
@@ -1123,7 +1368,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
- &node_states[N_HIGH_MEMORY]))
+ &node_states[N_MEMORY]))
break;
}
h->max_huge_pages = i;
@@ -1157,8 +1402,7 @@ static void __init report_hugepages(void)
for_each_hstate(h) {
char buf[32];
- printk(KERN_INFO "HugeTLB registered %s page size, "
- "pre-allocated %ld pages\n",
+ pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
memfmt(buf, huge_page_size(h)),
h->free_huge_pages);
}
@@ -1203,48 +1447,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
- int start_nid, next_nid;
- int ret = 0;
+ int nr_nodes, node;
VM_BUG_ON(delta != -1 && delta != 1);
- if (delta < 0)
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- else
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- int nid = next_nid;
- if (delta < 0) {
- /*
- * To shrink on this node, there must be a surplus page
- */
- if (!h->surplus_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_alloc(h,
- nodes_allowed);
- continue;
- }
+ if (delta < 0) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node])
+ goto found;
}
- if (delta > 0) {
- /*
- * Surplus cannot exceed the total number of pages
- */
- if (h->surplus_huge_pages_node[nid] >=
- h->nr_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_free(h,
- nodes_allowed);
- continue;
- }
+ } else {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node] <
+ h->nr_huge_pages_node[node])
+ goto found;
}
+ }
+ return 0;
- h->surplus_huge_pages += delta;
- h->surplus_huge_pages_node[nid] += delta;
- ret = 1;
- break;
- } while (next_nid != start_nid);
-
- return ret;
+found:
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[node] += delta;
+ return 1;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1374,7 +1598,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct hstate *h;
NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
- err = strict_strtoul(buf, 10, &count);
+ err = kstrtoul(buf, 10, &count);
if (err)
goto out;
@@ -1391,7 +1615,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
} else if (nodes_allowed) {
/*
@@ -1401,11 +1625,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
init_nodemask_of_node(nodes_allowed, nid);
} else
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
return len;
@@ -1465,7 +1689,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
if (h->order >= MAX_ORDER)
return -EINVAL;
- err = strict_strtoul(buf, 10, &input);
+ err = kstrtoul(buf, 10, &input);
if (err)
return err;
@@ -1540,7 +1764,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct attribute_group *hstate_attr_group)
{
int retval;
- int hi = h - hstates;
+ int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
@@ -1566,8 +1790,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
- h->name);
+ pr_err("Hugetlb: Unable to add hstate %s", h->name);
}
}
@@ -1575,9 +1798,9 @@ static void __init hugetlb_sysfs_init(void)
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node sysdevs in node_devices[] using a parallel array. The array
- * index of a node sysdev or _hstate == node id.
- * This is here to avoid any static dependency of the node sysdev driver, in
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
@@ -1587,7 +1810,7 @@ struct node_hstate {
struct node_hstate node_hstates[MAX_NUMNODES];
/*
- * A subset of global hstate attributes for node sysdevs
+ * A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
@@ -1601,7 +1824,7 @@ static struct attribute_group per_node_hstate_attr_group = {
};
/*
- * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
@@ -1624,29 +1847,31 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
}
/*
- * Unregister hstate attributes from a single node sysdev.
+ * Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-void hugetlb_unregister_node(struct node *node)
+static void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
- for_each_hstate(h)
- if (nhs->hstate_kobjs[h - hstates]) {
- kobject_put(nhs->hstate_kobjs[h - hstates]);
- nhs->hstate_kobjs[h - hstates] = NULL;
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
}
+ }
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
}
/*
- * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * hugetlb module exit: unregister hstate attributes from node devices
* that have them.
*/
static void hugetlb_unregister_all_nodes(void)
@@ -1654,7 +1879,7 @@ static void hugetlb_unregister_all_nodes(void)
int nid;
/*
- * disable node sysdev registrations.
+ * disable node device registrations.
*/
register_hugetlbfs_with_node(NULL, NULL);
@@ -1662,24 +1887,24 @@ static void hugetlb_unregister_all_nodes(void)
* remove hstate attributes from any nodes that have them.
*/
for (nid = 0; nid < nr_node_ids; nid++)
- hugetlb_unregister_node(&node_devices[nid]);
+ hugetlb_unregister_node(node_devices[nid]);
}
/*
- * Register hstate attributes for a single node sysdev.
+ * Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-void hugetlb_register_node(struct node *node)
+static void hugetlb_register_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->sysdev.kobj);
+ &node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
@@ -1688,9 +1913,8 @@ void hugetlb_register_node(struct node *node)
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
- " for node %d\n",
- h->name, node->sysdev.id);
+ pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
@@ -1699,21 +1923,21 @@ void hugetlb_register_node(struct node *node)
/*
* hugetlb init time: register hstate attributes for all registered node
- * sysdevs of nodes that have memory. All on-line nodes should have
- * registered their associated sysdev by this time.
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
*/
static void hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_HIGH_MEMORY) {
- struct node *node = &node_devices[nid];
- if (node->sysdev.id == nid)
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
+ if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
- * Let the node sysdev driver know we're here so it can
+ * Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
@@ -1742,7 +1966,7 @@ static void __exit hugetlb_exit(void)
hugetlb_unregister_all_nodes();
for_each_hstate(h) {
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hstate_index(h)]);
}
kobject_put(hugepages_kobj);
@@ -1763,19 +1987,17 @@ static int __init hugetlb_init(void)
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
- default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
if (default_hstate_max_huge_pages)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
hugetlb_init_hstates();
-
gather_bootmem_prealloc();
-
report_hugepages();
hugetlb_sysfs_init();
-
hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
return 0;
}
@@ -1788,20 +2010,21 @@ void __init hugetlb_add_hstate(unsigned order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
+ pr_warning("hugepagesz= specified twice, ignoring\n");
return;
}
- BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
- h = &hstates[max_hstate++];
+ h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
- h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
+ h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -1814,17 +2037,17 @@ static int __init hugetlb_nrpages_setup(char *s)
static unsigned long *last_mhp;
/*
- * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!max_hstate)
+ if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- printk(KERN_WARNING "hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ pr_warning("hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
return 1;
}
@@ -1836,7 +2059,7 @@ static int __init hugetlb_nrpages_setup(char *s)
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
- if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
@@ -1872,8 +2095,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
- if (!write)
- tmp = h->max_huge_pages;
+ tmp = h->max_huge_pages;
if (write && h->order >= MAX_ORDER)
return -EINVAL;
@@ -1890,11 +2112,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
out:
@@ -1918,18 +2140,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
}
#endif /* CONFIG_NUMA */
-int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
-{
- proc_dointvec(table, write, buffer, length, ppos);
- if (hugepages_treat_as_movable)
- htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
- else
- htlb_alloc_mask = GFP_HIGHUSER;
- return 0;
-}
-
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
@@ -1938,8 +2148,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
- if (!write)
- tmp = h->nr_overcommit_huge_pages;
+ tmp = h->nr_overcommit_huge_pages;
if (write && h->order >= MAX_ORDER)
return -EINVAL;
@@ -1989,11 +2198,30 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, h->surplus_huge_pages_node[nid]);
}
+void hugetlb_show_meminfo(void)
+{
+ struct hstate *h;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ for_each_hstate(h)
+ pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
- struct hstate *h = &default_hstate;
- return h->nr_huge_pages * pages_per_huge_page(h);
+ struct hstate *h;
+ unsigned long nr_total_pages = 0;
+
+ for_each_hstate(h)
+ nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+ return nr_total_pages;
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2039,40 +2267,50 @@ out:
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
/*
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
- * has a reference to the reservation map it cannot dissappear until
+ * has a reference to the reservation map it cannot disappear until
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (reservations)
- kref_get(&reservations->refs);
+ if (resv)
+ kref_get(&resv->refs);
+}
+
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *resv = vma_resv_map(vma);
+
+ if (!resv)
+ return;
+ kref_put(&resv->refs, resv_map_release);
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
- if (reservations) {
+ if (resv) {
start = vma_hugecache_offset(h, vma, vma->vm_start);
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) -
- region_count(&reservations->regions, start, end);
+ region_count(&resv->regions, start, end);
- kref_put(&reservations->refs, resv_map_release);
+ resv_map_put(vma);
if (reserve) {
hugetlb_acct_memory(h, -reserve);
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
@@ -2101,13 +2339,15 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
pte_t entry;
if (writable) {
- entry =
- pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
+ vma->vm_page_prot)));
} else {
- entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ entry = huge_pte_wrprotect(mk_huge_pte(page,
+ vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
return entry;
}
@@ -2117,10 +2357,9 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
- if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
- }
}
@@ -2175,9 +2414,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ if (non_swap_entry(swp) && is_migration_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2188,36 +2427,34 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
return 1;
- } else
+ else
return 0;
}
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
- struct page *tmp;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
-
- /*
- * A page gathering list, protected by per file i_mmap_lock. The
- * lock is used to avoid list corruption from multiple unmapping
- * of the same page since we are using page->lru.
- */
- LIST_HEAD(page_list);
+ const unsigned long mmun_start = start; /* For mmu_notifiers */
+ const unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
- mmu_notifier_invalidate_range_start(mm, start, end);
+ tlb_start_vma(tlb, vma);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
@@ -2227,16 +2464,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pmd_unshare(mm, &address, ptep))
continue;
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ huge_pte_clear(mm, address, ptep);
+ continue;
+ }
+
+ page = pte_page(pte);
/*
* If a reference page is supplied, it is because a specific
* page is being unmapped, not a range. Ensure the page we
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
- pte = huge_ptep_get(ptep);
- if (huge_pte_none(pte))
- continue;
- page = pte_page(pte);
if (page != ref_page)
continue;
@@ -2249,36 +2495,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- if (huge_pte_none(pte))
- continue;
-
- /*
- * HWPoisoned hugepage is already unmapped and dropped reference
- */
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
- continue;
-
- page = pte_page(pte);
- if (pte_dirty(pte))
+ tlb_remove_tlb_entry(tlb, ptep, address);
+ if (huge_pte_dirty(pte))
set_page_dirty(page);
- list_add(&page->lru, &page_list);
+
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page)
+ break;
}
spin_unlock(&mm->page_table_lock);
- flush_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range_end(mm, start, end);
- list_for_each_entry_safe(page, tmp, &page_list, lru) {
- page_remove_rmap(page);
- list_del(&page->lru);
- put_page(page);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
}
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
- __unmap_hugepage_range(vma, start, end, ref_page);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -2293,7 +2567,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
struct address_space *mapping;
- struct prio_tree_iter iter;
pgoff_t pgoff;
/*
@@ -2301,17 +2574,17 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
- + (vma->vm_pgoff >> PAGE_SHIFT);
- mapping = (struct address_space *)page_private(page);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ mapping = file_inode(vma->vm_file)->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
@@ -2324,17 +2597,19 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- __unmap_hugepage_range(iter_vma,
- address, address + huge_page_size(h),
- page);
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return 1;
}
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
@@ -2342,18 +2617,17 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int avoidcopy;
int outside_reserve = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_page = pte_page(pte);
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_mapcount(old_page) == 1);
- if (avoidcopy) {
- if (PageAnon(old_page))
- page_move_anon_rmap(old_page, vma, address);
+ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2367,8 +2641,7 @@ retry_avoidcopy:
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
- if (!(vma->vm_flags & VM_MAYSHARE) &&
- is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
@@ -2379,6 +2652,7 @@ retry_avoidcopy:
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
/*
@@ -2391,17 +2665,26 @@ retry_avoidcopy:
if (outside_reserve) {
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
- goto retry_avoidcopy;
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
+ */
+ return 0;
}
WARN_ON_ONCE(1);
}
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
- return -PTR_ERR(new_page);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
}
/*
@@ -2409,6 +2692,8 @@ retry_avoidcopy:
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
@@ -2418,6 +2703,9 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ mmun_start = address & huge_page_mask(h);
+ mmun_end = mmun_start + huge_page_size(h);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* Retake the page_table_lock to check for racing updates
* before the page tables are altered
@@ -2425,10 +2713,9 @@ retry_avoidcopy:
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ ClearPagePrivate(new_page);
+
/* Break COW */
- mmu_notifier_invalidate_range_start(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
@@ -2436,12 +2723,14 @@ retry_avoidcopy:
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
- mmu_notifier_invalidate_range_end(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
}
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
page_cache_release(new_page);
page_cache_release(old_page);
+
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return 0;
}
@@ -2483,6 +2772,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
+ int anon_rmap = 0;
pgoff_t idx;
unsigned long size;
struct page *page;
@@ -2492,12 +2782,11 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occured as it may not be obvious
+ * COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
- printk(KERN_WARNING
- "PID %d killed due to inadequate hugepage pool\n",
- current->pid);
+ pr_warning("PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
return ret;
}
@@ -2516,7 +2805,11 @@ retry:
goto out;
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
- ret = -PTR_ERR(page);
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2533,18 +2826,18 @@ retry:
goto retry;
goto out;
}
+ ClearPagePrivate(page);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
- page_dup_rmap(page);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
- hugepage_add_new_anon_rmap(page, vma, address);
+ anon_rmap = 1;
}
} else {
/*
@@ -2553,11 +2846,10 @@ retry:
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
- VM_FAULT_SET_HINDEX(h - hstates);
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
- page_dup_rmap(page);
}
/*
@@ -2581,6 +2873,12 @@ retry:
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
+ if (anon_rmap) {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, vma, address);
+ }
+ else
+ page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -2614,15 +2912,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ address &= huge_page_mask(h);
+
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait(mm, (pmd_t *)ptep, address);
+ migration_entry_wait_huge(mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(h - hstates);
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2651,7 +2951,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* page now as it is used to determine if a reservation has been
* consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
+ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -2670,6 +2970,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* so no worry about deadlock.
*/
page = pte_page(entry);
+ get_page(page);
if (page != pagecache_page)
lock_page(page);
@@ -2680,12 +2981,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
- if (!pte_write(entry)) {
+ if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page);
goto out_page_table_lock;
}
- entry = pte_mkdirty(entry);
+ entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
@@ -2701,6 +3002,7 @@ out_page_table_lock:
}
if (page != pagecache_page)
unlock_page(page);
+ put_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2708,23 +3010,14 @@ out_mutex:
return ret;
}
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i,
- unsigned int flags)
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
- int remainder = *length;
+ unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
spin_lock(&mm->page_table_lock);
@@ -2754,8 +3047,19 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
- if (absent ||
- ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
+ /*
+ * We need call hugetlb_fault for both hugepages under migration
+ * (in which case hugetlb_fault waits for the migration,) and
+ * hwpoisoned hugepages (in which case we need to prevent the
+ * caller from accessing to them.) In order to do this, we use
+ * here is_swap_pte instead of is_hugetlb_entry_migration and
+ * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+ * both cases, and because we can't follow correct pages
+ * directly from any kind of swap entries.
+ */
+ if (absent || is_swap_pte(huge_ptep_get(pte)) ||
+ ((flags & FOLL_WRITE) &&
+ !huge_pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
@@ -2794,13 +3098,13 @@ same_page:
}
}
spin_unlock(&mm->page_table_lock);
- *length = remainder;
+ *nr_pages = remainder;
*position = vaddr;
return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -2808,44 +3112,57 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (huge_pmd_unshare(mm, &address, ptep))
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
continue;
+ }
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(pte_modify(pte, newprot));
+ pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte = arch_make_huge_pte(pte, vma, NULL, 0);
set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
}
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
- int acctflag)
+ vm_flags_t vm_flags)
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
- * and filesystem quota without using reserves
+ * without using reserves
*/
- if (acctflag & VM_NORESERVE)
+ if (vm_flags & VM_NORESERVE)
return 0;
/*
@@ -2867,21 +3184,25 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0)
- return chg;
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
- /* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg))
- return -ENOSPC;
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
/*
* Check enough hugepages are available for the reservation.
- * Hand back the quota if there are not
+ * Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
- hugetlb_put_quota(inode->i_mapping, chg);
- return ret;
+ hugepage_subpool_put_pages(spool, chg);
+ goto out_err;
}
/*
@@ -2898,21 +3219,236 @@ int hugetlb_reserve_pages(struct inode *inode,
if (!vma || vma->vm_flags & VM_MAYSHARE)
region_add(&inode->i_mapping->private_list, from, to);
return 0;
+out_err:
+ if (vma)
+ resv_map_put(vma);
+ return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
+ hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&mm->page_table_lock);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
+ }
+ return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
@@ -2941,7 +3477,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
spin_lock(&hugetlb_lock);
if (is_hugepage_on_freelist(hpage)) {
- list_del(&hpage->lru);
+ /*
+ * Hwpoisoned hugepage isn't linked to activelist or freelist,
+ * but dangling hpage->lru can trigger list-debug warnings
+ * (this happens when we call unpoison_memory() on it),
+ * so let it point to itself with list_del_init().
+ */
+ list_del_init(&hpage->lru);
set_page_refcounted(hpage);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -2951,3 +3493,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
return ret;
}
#endif
+
+bool isolate_huge_page(struct page *page, struct list_head *list)
+{
+ VM_BUG_ON(!PageHead(page));
+ if (!get_page_unless_zero(page))
+ return false;
+ spin_lock(&hugetlb_lock);
+ list_move_tail(&page->lru, list);
+ spin_unlock(&hugetlb_lock);
+ return true;
+}
+
+void putback_active_hugepage(struct page *page)
+{
+ VM_BUG_ON(!PageHead(page));
+ spin_lock(&hugetlb_lock);
+ list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ put_page(page);
+}
+
+bool is_hugepage_active(struct page *page)
+{
+ VM_BUG_ON(!PageHuge(page));
+ /*
+ * This function can be called for a tail page because the caller,
+ * scan_movable_pages, scans through a given pfn-range which typically
+ * covers one memory block. In systems using gigantic hugepage (1GB
+ * for x86_64,) a hugepage is larger than a memory block, and we don't
+ * support migrating such large hugepages for now, so return false
+ * when called for tail pages.
+ */
+ if (PageTail(page))
+ return false;
+ /*
+ * Refcount of a hwpoisoned hugepages is 1, but they are not active,
+ * so we should return false for them.
+ */
+ if (unlikely(PageHWPoison(page)))
+ return false;
+ return page_count(page) > 0;
+}