Diffstat (limited to 'mm/hugetlb.c')
 mm/hugetlb.c | 803
 1 file changed, 618 insertions(+), 185 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd8737a94bec..f5fb53fdfa02 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,132 +220,303 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
return subpool_inode(file_inode(vma->vm_file));
}
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- * across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock. The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map. The from and to elements are huge page
- * indicies into the associated mapping. from indicates the starting index
- * of the region. to represents the first index past the end of the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping. It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
*/
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
+{
+ struct file_region *nrg = NULL;
+
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+ VM_BUG_ON(!nrg);
+ list_del(&nrg->link);
+
+ nrg->from = from;
+ nrg->to = to;
+
+ return nrg;
+}
+
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+ struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ nrg->reservation_counter = rg->reservation_counter;
+ nrg->css = rg->css;
+ if (rg->css)
+ css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+ struct hstate *h,
+ struct resv_map *resv,
+ struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (h_cg) {
+ nrg->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ nrg->css = &h_cg->css;
+ if (!resv->pages_per_hpage)
+ resv->pages_per_hpage = pages_per_huge_page(h);
+ /* pages_per_hpage should be the same for all entries in
+ * a resv_map.
+ */
+ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+ } else {
+ nrg->reservation_counter = NULL;
+ nrg->css = NULL;
+ }
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+ struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ return rg && org &&
+ rg->reservation_counter == org->reservation_counter &&
+ rg->css == org->css;
+
+#else
+ return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+ struct file_region *nrg = NULL, *prg = NULL;
+
+ prg = list_prev_entry(rg, link);
+ if (&prg->link != &resv->regions && prg->to == rg->from &&
+ has_same_uncharge_info(prg, rg)) {
+ prg->to = rg->to;
+
+ list_del(&rg->link);
+ kfree(rg);
+
+ coalesce_file_region(resv, prg);
+ return;
+ }
+
+ nrg = list_next_entry(rg, link);
+ if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+ has_same_uncharge_info(nrg, rg)) {
+ nrg->from = rg->from;
+
+ list_del(&rg->link);
+ kfree(rg);
+
+ coalesce_file_region(resv, nrg);
+ return;
+ }
+}
/* Must be called with resv->lock held. Calling this with count_only == true
* will count the number of pages to be added but will not modify the linked
- * list.
+ * list. If regions_needed != NULL and count_only == true, then regions_needed
+ * will indicate the number of file_regions needed in the cache to add the
+ * regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h, long *regions_needed,
bool count_only)
{
- long chg = 0;
+ long add = 0;
struct list_head *head = &resv->regions;
+ long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
+ if (regions_needed)
+ *regions_needed = 0;
- chg = t - f;
+ /* In this loop, we essentially handle an entry for the range
+ * [last_accounted_offset, rg->from), at every iteration, with some
+ * bounds checking.
+ */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ /* Skip irrelevant regions that start before our range. */
+ if (rg->from < f) {
+ /* If this region ends after the last accounted offset,
+ * then we need to update last_accounted_offset.
+ */
+ if (rg->to > last_accounted_offset)
+ last_accounted_offset = rg->to;
+ continue;
+ }
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
+ /* When we find a region that starts beyond our range, we've
+ * finished.
+ */
if (rg->from > t)
break;
- /* We overlap with this area, if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation.
+ /* Add an entry for last_accounted_offset -> rg->from, and
+ * update last_accounted_offset.
+ */
+ if (rg->from > last_accounted_offset) {
+ add += rg->from - last_accounted_offset;
+ if (!count_only) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, rg->from);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h,
+ resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else if (regions_needed)
+ *regions_needed += 1;
+ }
+
+ last_accounted_offset = rg->to;
+ }
+
+ /* Handle the case where our range extends beyond
+ * last_accounted_offset.
+ */
+ if (last_accounted_offset < t) {
+ add += t - last_accounted_offset;
+ if (!count_only) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, t);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else if (regions_needed)
+ *regions_needed += 1;
+ }
+
+ VM_BUG_ON(add < 0);
+ return add;
+}
+
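For readers unfamiliar with the region-tracking scheme, the following is a minimal, self-contained userspace sketch of the idea implemented by add_reservation_in_range() and coalesce_file_region(): regions are half-open huge-page index intervals [from, to), kept sorted and non-overlapping; a "chg"-style pass counts the pages of a requested range that are not yet covered; adjacent regions are merged. The array representation, function names and main() below are hypothetical simplifications, not the kernel's data structures (which use a linked list protected by resv->lock and carry per-region cgroup uncharge info).

#include <stdio.h>

struct region { long from, to; };	/* half-open interval [from, to) */

/* Count pages in [f, t) not covered by the sorted, disjoint regions. */
static long count_missing(const struct region *r, int n, long f, long t)
{
	long last = f, missing = 0;

	for (int i = 0; i < n && r[i].from < t; i++) {
		if (r[i].to <= last)
			continue;			/* ends before our range */
		if (r[i].from > last)
			missing += r[i].from - last;	/* gap before this region */
		last = r[i].to;
	}
	if (last < t)
		missing += t - last;			/* tail gap */
	return missing;
}

/* Merge adjacent regions in place; returns the new region count. */
static int coalesce(struct region *r, int n)
{
	int out = 0;

	for (int i = 0; i < n; i++) {
		if (out && r[out - 1].to == r[i].from)
			r[out - 1].to = r[i].to;	/* extend previous region */
		else
			r[out++] = r[i];
	}
	return out;
}

int main(void)
{
	struct region r[] = { {0, 2}, {2, 4}, {6, 8} };
	int n = sizeof(r) / sizeof(r[0]);

	/* [0,4) and [6,8) are covered, so [0,10) is missing pages 4,5,8,9. */
	printf("missing in [0,10): %ld\n", count_missing(r, n, 0, 10));	/* 4 */

	n = coalesce(r, n);				/* {0,2},{2,4} -> {0,4} */
	printf("regions after coalesce: %d\n", n);	/* 2 */
	return 0;
}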
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+ int regions_needed)
+ __must_hold(&resv->lock)
+{
+ struct list_head allocated_regions;
+ int to_allocate = 0, i = 0;
+ struct file_region *trg = NULL, *rg = NULL;
+
+ VM_BUG_ON(regions_needed < 0);
+
+ INIT_LIST_HEAD(&allocated_regions);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations plus regions_needed.
+ *
+ * This is a while loop because when we drop the lock, some other call
+ * to region_add or region_del may have consumed some region_entries,
+ * so we keep looping here until we finally have enough entries for
+ * (adds_in_progress + regions_needed).
+ */
+ while (resv->region_cache_count <
+ (resv->adds_in_progress + regions_needed)) {
+ to_allocate = resv->adds_in_progress + regions_needed -
+ resv->region_cache_count;
+
+ /* At this point, we should have enough entries in the cache
+ * for all the existing adds_in_progress. We should only be
+ * needing to allocate for regions_needed.
*/
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
+ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+ spin_unlock(&resv->lock);
+ for (i = 0; i < to_allocate; i++) {
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ goto out_of_memory;
+ list_add(&trg->link, &allocated_regions);
}
- chg -= rg->to - rg->from;
- if (!count_only && rg != nrg) {
+ spin_lock(&resv->lock);
+
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
list_del(&rg->link);
- kfree(rg);
+ list_add(&rg->link, &resv->region_cache);
+ resv->region_cache_count++;
}
}
- if (!count_only) {
- nrg->from = f;
- nrg->to = t;
- }
+ return 0;
- return chg;
+out_of_memory:
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return -ENOMEM;
}
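The allocation loop above follows a common kernel pattern: the lock protecting the cache cannot be held across kmalloc(), so it is dropped, entries are allocated, the lock is re-taken, and the target condition is re-checked because other callers may have drained the cache in the meantime. Below is a minimal userspace sketch of that pattern with a pthread mutex and hypothetical stand-in types; unlike the kernel routine, for simplicity it returns with the lock held even on allocation failure.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { struct entry *next; };	/* stand-in for struct file_region */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache;		/* spare entries, kept as a LIFO list */
static int cache_count;			/* entries currently in the cache */
static int adds_in_progress;		/* entries already promised to callers */

/* Must be called with 'lock' held; drops and re-takes it to allocate. */
static int ensure_cache(int regions_needed)
{
	while (cache_count < adds_in_progress + regions_needed) {
		int to_allocate = adds_in_progress + regions_needed - cache_count;
		struct entry *local = NULL, *e;

		pthread_mutex_unlock(&lock);	/* never allocate under the lock */
		for (int i = 0; i < to_allocate; i++) {
			e = malloc(sizeof(*e));
			if (!e) {
				while ((e = local)) {	/* undo partial allocation */
					local = e->next;
					free(e);
				}
				pthread_mutex_lock(&lock);
				return -1;
			}
			e->next = local;
			local = e;
		}
		pthread_mutex_lock(&lock);

		while ((e = local)) {		/* move new entries into the cache */
			local = e->next;
			e->next = cache;
			cache = e;
			cache_count++;
		}
		/* Loop again: the cache may have been drained while unlocked. */
	}
	return 0;
}

int main(void)
{
	pthread_mutex_lock(&lock);
	adds_in_progress = 2;
	if (ensure_cache(3) == 0)
		printf("cache_count now %d\n", cache_count);	/* prints 5 */
	pthread_mutex_unlock(&lock);
	return 0;
}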
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the specified
- * range, or a region will be taken from the cache. Sufficient regions
- * must exist in the cache due to the previous call to region_chg with
- * the same range.
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
*
- * Return the number of new huge pages added to the map. This
- * number is greater than or equal to zero.
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it returns -ENOMEM.
+ * region_add of regions of length 1 never allocates file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
*/
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct resv_map *resv, long f, long t,
+ long in_regions_needed, struct hstate *h,
+ struct hugetlb_cgroup *h_cg)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg;
- long add = 0;
+ long add = 0, actual_regions_needed = 0;
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+retry:
+
+ /* Count how many regions are actually needed to execute this add. */
+ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
+ true);
/*
- * If no region exists which can be expanded to include the
- * specified range, pull a region descriptor from the cache
- * and use it for this range.
+ * Check for sufficient descriptors in the cache to accommodate
+ * this add operation. Note that actual_regions_needed may be greater
+ * than in_regions_needed, as the resv_map may have been modified since
+ * the region_chg call. In this case, we need to make sure that we
+ * allocate extra entries, such that we have enough for all the
+ * existing adds_in_progress, plus the excess needed for this
+ * operation.
*/
- if (&rg->link == head || t < rg->from) {
- VM_BUG_ON(resv->region_cache_count <= 0);
-
- resv->region_cache_count--;
- nrg = list_first_entry(&resv->region_cache, struct file_region,
- link);
- list_del(&nrg->link);
+ if (actual_regions_needed > in_regions_needed &&
+ resv->region_cache_count <
+ resv->adds_in_progress +
+ (actual_regions_needed - in_regions_needed)) {
+ /* region_add operation of range 1 should never need to
+ * allocate file_region entries.
+ */
+ VM_BUG_ON(t - f <= 1);
- nrg->from = f;
- nrg->to = t;
- list_add(&nrg->link, rg->link.prev);
+ if (allocate_file_region_entries(
+ resv, actual_regions_needed - in_regions_needed)) {
+ return -ENOMEM;
+ }
- add += t - f;
- goto out_locked;
+ goto retry;
}
- add = add_reservation_in_range(resv, f, t, false);
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+
+ resv->adds_in_progress -= in_regions_needed;
-out_locked:
- resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -358,46 +529,37 @@ out_locked:
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
- * map. A new file_region structure is added to the cache
- * as a placeholder, so that the subsequent region_add
- * call will have all the regions it needs and will not fail.
+ * map. A number of new file_region structures are added to the cache as
+ * placeholders for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
+ *
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
* zero. -ENOMEM is returned if a new file_region structure or cache entry
* is needed and can not be allocated.
*/
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+ long *out_regions_needed)
{
long chg = 0;
spin_lock(&resv->lock);
-retry_locked:
- resv->adds_in_progress++;
- /*
- * Check for sufficient descriptors in the cache to accommodate
- * the number of in progress add operations.
- */
- if (resv->adds_in_progress > resv->region_cache_count) {
- struct file_region *trg;
-
- VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
- /* Must drop lock to allocate a new descriptor. */
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
+ /* Count how many hugepages in this range are NOT represented. */
+ chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+ out_regions_needed, true);
- trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg)
- return -ENOMEM;
+ if (*out_regions_needed == 0)
+ *out_regions_needed = 1;
- spin_lock(&resv->lock);
- list_add(&trg->link, &resv->region_cache);
- resv->region_cache_count++;
- goto retry_locked;
- }
+ if (allocate_file_region_entries(resv, *out_regions_needed))
+ return -ENOMEM;
- chg = add_reservation_in_range(resv, f, t, true);
+ resv->adds_in_progress += *out_regions_needed;
spin_unlock(&resv->lock);
return chg;
@@ -408,17 +570,20 @@ retry_locked:
* of the resv_map keeps track of the operations in progress between
* calls to region_chg and region_add. Operations are sometimes
* aborted after the call to region_chg. In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call; it is used to decrement
+ * the adds_in_progress counter.
*
* NOTE: The range arguments [f, t) are not needed or used in this
* routine. They are kept to make reading the calling code easier as
* arguments will match the associated region_chg call.
*/
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+ long regions_needed)
{
spin_lock(&resv->lock);
VM_BUG_ON(!resv->region_cache_count);
- resv->adds_in_progress--;
+ resv->adds_in_progress -= regions_needed;
spin_unlock(&resv->lock);
}
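The contract spelled out in the comments above is that every region_chg() must be matched by exactly one region_add() or region_abort() carrying the same regions_needed value, so that adds_in_progress balances out. A toy, self-contained userspace model of that pairing (all names and the trivial bookkeeping are hypothetical, not the kernel's code):

#include <assert.h>
#include <stdio.h>

struct toy_resv {
	long adds_in_progress;	/* cache entries promised to pending adds */
	long cache_count;	/* spare entries available for those adds */
};

/* Prepare an add: reserve cache entries, report how many were reserved. */
static long toy_region_chg(struct toy_resv *r, long regions_needed)
{
	if (regions_needed == 0)
		regions_needed = 1;		/* always reserve at least one */
	r->cache_count += regions_needed;	/* pretend allocation succeeded */
	r->adds_in_progress += regions_needed;
	return regions_needed;
}

/* Commit: consume the promise made by toy_region_chg(). */
static void toy_region_add(struct toy_resv *r, long in_regions_needed)
{
	r->adds_in_progress -= in_regions_needed;
}

/* Abort: hand the promise back without modifying the map. */
static void toy_region_abort(struct toy_resv *r, long regions_needed)
{
	r->adds_in_progress -= regions_needed;
}

int main(void)
{
	struct toy_resv r = { 0, 0 };

	long n1 = toy_region_chg(&r, 2);	/* an add needing two entries... */
	toy_region_add(&r, n1);			/* ...that goes through */

	long n2 = toy_region_chg(&r, 0);	/* another one (>= 1 entry)... */
	toy_region_abort(&r, n2);		/* ...that is abandoned */

	assert(r.adds_in_progress == 0);	/* every chg matched exactly once */
	printf("adds_in_progress balanced at %ld\n", r.adds_in_progress);
	return 0;
}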
@@ -486,11 +651,17 @@ retry:
/* New entry for end of split region */
nrg->from = t;
nrg->to = rg->to;
+
+ copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
INIT_LIST_HEAD(&nrg->link);
/* Original entry is trimmed */
rg->to = f;
+ hugetlb_cgroup_uncharge_file_region(
+ resv, rg, nrg->to - nrg->from);
+
list_add(&nrg->link, &rg->link);
nrg = NULL;
break;
@@ -498,6 +669,8 @@ retry:
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - rg->from);
list_del(&rg->link);
kfree(rg);
continue;
@@ -506,9 +679,15 @@ retry:
if (f <= rg->from) { /* Trim beginning of region */
del += t - rg->from;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ t - rg->from);
+
rg->from = t;
} else { /* Trim end of region */
del += rg->to - f;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - f);
+
rg->to = f;
}
}
@@ -650,6 +829,25 @@ static void set_vma_private_data(struct vm_area_struct *vma,
vma->vm_private_data = (void *)value;
}
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (!h_cg || !h) {
+ resv_map->reservation_counter = NULL;
+ resv_map->pages_per_hpage = 0;
+ resv_map->css = NULL;
+ } else {
+ resv_map->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ resv_map->pages_per_hpage = pages_per_huge_page(h);
+ resv_map->css = &h_cg->css;
+ }
+#endif
+}
+
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -666,6 +864,13 @@ struct resv_map *resv_map_alloc(void)
INIT_LIST_HEAD(&resv_map->regions);
resv_map->adds_in_progress = 0;
+ /*
+ * Initialize these to 0. On shared mappings, 0's here indicate these
+ * fields don't do cgroup accounting. On private mappings, these will be
+ * re-initialized to the proper values, to indicate that hugetlb cgroup
+ * reservations are to be un-charged from here.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
@@ -1009,6 +1214,9 @@ static void destroy_compound_gigantic_page(struct page *page,
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
@@ -1069,6 +1277,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
@@ -1180,6 +1389,8 @@ static void __free_huge_page(struct page *page)
clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1254,6 +1465,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
+ set_hugetlb_cgroup_rsvd(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -1287,6 +1499,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
+
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
}
/*
@@ -1313,7 +1528,107 @@ int PageHeadHuge(struct page *page_head)
if (!PageHead(page_head))
return 0;
- return get_compound_page_dtor(page_head) == free_huge_page;
+ return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find address_space associated with hugetlbfs page.
+ * Upon entry page is locked and page 'was' mapped although mapped state
+ * could change. If necessary, use anon_vma to find vma and associated
+ * address space. The returned mapping may be stale, but it can not be
+ * invalid as page lock (which is held) is required to destroy mapping.
+ */
+static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff_start, pgoff_end;
+ struct anon_vma_chain *avc;
+ struct address_space *mapping = page_mapping(hpage);
+
+ /* Simple file based mapping */
+ if (mapping)
+ return mapping;
+
+ /*
+ * Even anonymous hugetlbfs mappings are associated with an
+ * underlying hugetlbfs file (see hugetlb_file_setup in mmap
+ * code). Find a vma associated with the anonymous vma, and
+ * use the file pointer to get address_space.
+ */
+ anon_vma = page_lock_anon_vma_read(hpage);
+ if (!anon_vma)
+ return mapping; /* NULL */
+
+ /* Use first found vma */
+ pgoff_start = page_to_pgoff(hpage);
+ pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff_start, pgoff_end) {
+ struct vm_area_struct *vma = avc->vma;
+
+ mapping = vma->vm_file->f_mapping;
+ break;
+ }
+
+ anon_vma_unlock_read(anon_vma);
+ return mapping;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which allows us to find the mapping
+ * even in the case of an anon page. However, locking order dictates
+ * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs
+ * specific. So, we first try to lock the sema while still holding the
+ * page lock. If this works, great! If not, then we need to drop the
+ * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of
+ * course, need to revalidate state along the way.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+ struct address_space *mapping, *mapping2;
+
+ mapping = _get_hugetlb_page_mapping(hpage);
+retry:
+ if (!mapping)
+ return mapping;
+
+ /*
+ * If no contention, take lock and return
+ */
+ if (i_mmap_trylock_write(mapping))
+ return mapping;
+
+ /*
+ * Must drop page lock and wait on mapping sema.
+ * Note: Once page lock is dropped, mapping could become invalid.
+ * As a hack, increase map count until we lock page again.
+ */
+ atomic_inc(&hpage->_mapcount);
+ unlock_page(hpage);
+ i_mmap_lock_write(mapping);
+ lock_page(hpage);
+ atomic_add_negative(-1, &hpage->_mapcount);
+
+ /* verify page is still mapped */
+ if (!page_mapped(hpage)) {
+ i_mmap_unlock_write(mapping);
+ return NULL;
+ }
+
+ /*
+ * Get address space again and verify it is the same one
+ * we locked. If not, drop lock and retry.
+ */
+ mapping2 = _get_hugetlb_page_mapping(hpage);
+ if (mapping2 != mapping) {
+ i_mmap_unlock_write(mapping);
+ mapping = mapping2;
+ goto retry;
+ }
+
+ return mapping;
}
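The retry logic above is an instance of the classic lock-ordering workaround: lock A (i_mmap_rwsem) must be taken before lock B (the page lock), but the caller already holds B, so A is try-locked first and, failing that, B is dropped, both locks are taken in the correct order, and the state derived under B alone is revalidated. A minimal userspace sketch of the pattern with two pthread mutexes and a hypothetical generation counter standing in for the revalidation step:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t A = PTHREAD_MUTEX_INITIALIZER;	/* must be taken first */
static pthread_mutex_t B = PTHREAD_MUTEX_INITIALIZER;	/* held by the caller */
static int generation;					/* bumped whenever state changes */

/* Called with B held; returns 0 with A and B held, -1 if state went stale. */
static int lock_A_while_holding_B(void)
{
	int seen = generation;

	if (pthread_mutex_trylock(&A) == 0)
		return 0;			/* no contention: done */

	pthread_mutex_unlock(&B);		/* respect the A-before-B ordering */
	pthread_mutex_lock(&A);
	pthread_mutex_lock(&B);

	if (generation != seen) {		/* state changed while B was dropped */
		pthread_mutex_unlock(&A);
		return -1;			/* caller must revalidate and retry */
	}
	return 0;
}

int main(void)
{
	pthread_mutex_lock(&B);
	if (lock_A_while_holding_B() == 0) {
		printf("acquired A and B in the required order\n");
		pthread_mutex_unlock(&A);
	}
	pthread_mutex_unlock(&B);
	return 0;
}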
pgoff_t __basepage_index(struct page *page)
@@ -1695,6 +2010,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
+ __must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
@@ -1870,6 +2186,7 @@ static long __vma_reservation_common(struct hstate *h,
struct resv_map *resv;
pgoff_t idx;
long ret;
+ long dummy_out_regions_needed;
resv = vma_resv_map(vma);
if (!resv)
@@ -1878,20 +2195,29 @@ static long __vma_reservation_common(struct hstate *h,
idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
case VMA_NEEDS_RESV:
- ret = region_chg(resv, idx, idx + 1);
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+ /* We assume that vma_reservation_* routines always operate on
+ * 1 page, and that adding to resv map a 1 page entry can only
+ * ever require 1 region.
+ */
+ VM_BUG_ON(dummy_out_regions_needed != 1);
break;
case VMA_COMMIT_RESV:
- ret = region_add(resv, idx, idx + 1);
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
break;
case VMA_END_RESV:
- region_abort(resv, idx, idx + 1);
+ region_abort(resv, idx, idx + 1, 1);
ret = 0;
break;
case VMA_ADD_RESV:
- if (vma->vm_flags & VM_MAYSHARE)
- ret = region_add(resv, idx, idx + 1);
- else {
- region_abort(resv, idx, idx + 1);
+ if (vma->vm_flags & VM_MAYSHARE) {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ } else {
+ region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
}
break;
@@ -2002,6 +2328,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
+ bool deferred_reserve;
idx = hstate_index(h);
/*
@@ -2039,9 +2366,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
gbl_chg = 1;
}
+ /* If this allocation is not consuming a reservation, charge it now.
+ */
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ if (deferred_reserve) {
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_subpool_put;
+ }
+
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
- goto out_subpool_put;
+ goto out_uncharge_cgroup_reservation;
spin_lock(&hugetlb_lock);
/*
@@ -2064,6 +2401,14 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ /* If allocation is not consuming a reservation, also store the
+ * hugetlb_cgroup pointer on the page.
+ */
+ if (deferred_reserve) {
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
+ h_cg, page);
+ }
+
spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
@@ -2088,6 +2433,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_uncharge_cgroup_reservation:
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
+ h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
@@ -3188,9 +3537,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) - region_count(resv, start, end);
-
- kref_put(&resv->refs, resv_map_release);
-
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
if (reserve) {
/*
* Decrement reserve counts. The global reserve count may be
@@ -3199,6 +3546,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
+
+ kref_put(&resv->refs, resv_map_release);
}
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -3306,6 +3655,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
@@ -3316,6 +3666,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
@@ -3393,6 +3751,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(&range);
+ else
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -3812,16 +4172,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
+ * We can not race with truncation due to holding i_mmap_rwsem.
+ * i_size is modified when holding i_mmap_rwsem, so check here
+ * once for faults beyond end of file.
*/
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
/*
* Check for page in userfault range
*/
@@ -3841,13 +4202,15 @@ retry:
};
/*
- * hugetlb_fault_mutex must be dropped before
- * handling userfault. Reacquire after handling
- * fault to make calling code simpler.
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3925,10 +4288,6 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
-
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
@@ -4012,6 +4371,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -4025,14 +4389,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with i_size modifications during truncation.
+ *
+ * ptep could have already been assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4120,6 +4501,7 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4266,7 +4648,7 @@ out_release_nounlock:
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
+ long i, unsigned int flags, int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -4337,14 +4719,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_KILLABLE;
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
if (flags & FOLL_TRIED) {
- VM_WARN_ON_ONCE(fault_flags &
- FAULT_FLAG_ALLOW_RETRY);
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED can co-exist
+ */
fault_flags |= FAULT_FLAG_TRIED;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
@@ -4354,9 +4739,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking &&
+ if (locked &&
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *nonblocking = 0;
+ *locked = 0;
*nr_pages = 0;
/*
* VM_FAULT_RETRY must not return an
@@ -4376,19 +4761,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
/*
- * Instead of doing 'try_get_page()' below in the same_page
- * loop, just check the count once here.
- */
- if (unlikely(page_count(page) <= 0)) {
- if (pages) {
- spin_unlock(ptl);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- /*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
@@ -4405,7 +4777,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables. If
+ * the huge page is present, then the tail pages must
+ * also be present. The ptl prevents the head page and
+ * tail pages from being rearranged in any way. So this
+ * page must be available at this point, unless the page
+ * refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
}
if (vmas)
@@ -4541,11 +4928,12 @@ int hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg;
+ long ret, chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
- long gbl_reserve;
+ struct hugetlb_cgroup *h_cg = NULL;
+ long gbl_reserve, regions_needed = 0;
/* This should never happen */
if (from > to) {
@@ -4575,9 +4963,10 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
resv_map = inode_resv_map(inode);
- chg = region_chg(resv_map, from, to);
+ chg = region_chg(resv_map, from, to, &regions_needed);
} else {
+ /* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
@@ -4593,6 +4982,21 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
+ * off the resv_map.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+ }
+
/*
* There must be enough pages in the subpool for the mapping. If
* the subpool has a minimum size, there may be some global
@@ -4601,7 +5005,7 @@ int hugetlb_reserve_pages(struct inode *inode,
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
if (gbl_reserve < 0) {
ret = -ENOSPC;
- goto out_err;
+ goto out_uncharge_cgroup;
}
/*
@@ -4610,9 +5014,7 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
- goto out_err;
+ goto out_put_pages;
}
/*
@@ -4627,9 +5029,12 @@ int hugetlb_reserve_pages(struct inode *inode,
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
- long add = region_add(resv_map, from, to);
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
- if (unlikely(chg > add)) {
+ if (unlikely(add < 0)) {
+ hugetlb_acct_memory(h, -gbl_reserve);
+ goto out_put_pages;
+ } else if (unlikely(chg > add)) {
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
@@ -4639,17 +5044,29 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
long rsv_adjust;
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
+ hstate_index(h),
+ (chg - add) * pages_per_huge_page(h), h_cg);
+
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
}
}
return 0;
+out_put_pages:
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), h_cg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- /* Don't call region_abort if region_chg failed */
- if (chg >= 0)
- region_abort(resv_map, from, to);
+ /* Only call region_abort if the region_chg succeeded but the
+ * region_add failed or didn't run.
+ */
+ if (chg >= 0 && add < 0)
+ region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
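As a worked example of the chg > add adjustment above (the numbers are made up): if region_chg() reported 4 missing huge pages but another task reserved 2 of them before region_add() ran, only the difference is kept, so 2 huge pages go back to the subpool and the matching reservation charge is uncharged from the cgroup.

#include <stdio.h>

int main(void)
{
	long chg = 4, add = 2;		/* hypothetical region_chg/region_add results */
	long pages_per_huge_page = 512;	/* e.g. 2 MiB huge pages, 4 KiB base pages */

	long rsv_adjust = chg - add;	/* huge pages returned to the subpool */
	long uncharged = rsv_adjust * pages_per_huge_page;

	printf("put back %ld huge pages, uncharge %ld base pages\n",
	       rsv_adjust, uncharged);	/* prints 2 and 1024 */
	return 0;
}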
@@ -4740,7 +5157,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr = *start;
+ unsigned long check_addr;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
@@ -4765,10 +5182,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space. This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4785,7 +5204,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4815,7 +5233,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_read(mapping);
return pte;
}
@@ -4826,7 +5243,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
@@ -4965,6 +5382,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
struct page *page = NULL;
spinlock_t *ptl;
pte_t pte;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4977,8 +5400,18 @@ retry:
pte = huge_ptep_get((pte_t *)pmd);
if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
+ /*
+ * try_grab_page() should always succeed here, because: a) we
+ * hold the pmd (ptl) lock, and b) we've just checked that the
+ * huge pmd (head) page is present in the page tables. The ptl
+ * prevents the head page and tail pages from being rearranged
+ * in any way. So this page must be available at this point,
+ * unless the page refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
@@ -4999,7 +5432,7 @@ struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5008,7 +5441,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);