 fs/hugetlbfs/inode.c | 61
 mm/hugetlb.c         | 21
 2 files changed, 38 insertions(+), 44 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..a2fcea5f8225 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() prevents page faults in the
- * truncated range. It checks i_size before allocation, and again after
- * with the page table lock for the page held. The same lock must be
- * acquired to unmap a page.
+ * maps and global counts.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
- * pages are not modified. Page faults can race with hole punch.
- * This is indicated if we find a mapped page.
+ * pages are not modified.
+ *
+ * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
+ * races with page faults.
+ *
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
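To make the caller contract above concrete, here is a minimal userspace sketch (illustration only): pthread_rwlock_t stands in for i_mmap_rwsem, and remove_pages(), vmtruncate(), and fault_in() below are hypothetical simplifications, not the kernel routines.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static long i_size = 16;               /* file size, in huge pages */
static bool page_present[16];          /* which indices have a "page" */

/* Model of remove_inode_hugepages(): caller holds the write lock. */
static void remove_pages(long start, long end)
{
    for (long idx = start; idx < end && idx < 16; idx++)
        page_present[idx] = false;     /* release found pages */
}

/* Truncate path: the write lock covers both i_size and page removal. */
static void vmtruncate(long new_size)
{
    pthread_rwlock_wrlock(&i_mmap_rwsem);
    i_size = new_size;
    remove_pages(new_size, 16);
    pthread_rwlock_unlock(&i_mmap_rwsem);
}

/* Fault path: a read lock suffices; i_size is stable while held. */
static bool fault_in(long idx)
{
    bool ok = false;

    pthread_rwlock_rdlock(&i_mmap_rwsem);
    if (idx >= 0 && idx < i_size) {    /* check once; cannot change here */
        page_present[idx] = true;
        ok = true;
    }
    pthread_rwlock_unlock(&i_mmap_rwsem);
    return ok;
}

int main(void)
{
    fault_in(10);
    vmtruncate(8);
    printf("fault at 10 after truncate to 8: %d\n", fault_in(10));
    return 0;
}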
@@ -423,32 +422,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
- u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, current->mm,
- &pseudo_vma,
- mapping, index, 0);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
/*
- * If page is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the page.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * A mapped page is impossible here: callers must unmap
+ * all references before calling, and i_mmap_rwsem
+ * prevents the creation of additional mappings.
*/
- if (unlikely(page_mapped(page))) {
- BUG_ON(truncate_op);
-
- i_mmap_lock_write(mapping);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
- i_mmap_unlock_write(mapping);
- }
+ VM_BUG_ON(page_mapped(page));
lock_page(page);
/*
@@ -470,7 +451,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
unlock_page(page);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
cond_resched();
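The hunk above replaces race handling with an invariant: once the remover holds the write lock, page_mapped() can no longer become true behind its back, so the old "unmap it again" fallback becomes an assertion. A hypothetical userspace model of that shift, where mapcount stands in for page_mapped() and assert() for VM_BUG_ON():

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static int mapcount;                   /* stand-in for page_mapped() */

static void *fault_thread(void *arg)   /* reader: creates mappings */
{
    (void)arg;
    for (int i = 0; i < 100000; i++) {
        pthread_rwlock_rdlock(&i_mmap_rwsem);
        mapcount++;                    /* fault maps the page */
        pthread_rwlock_unlock(&i_mmap_rwsem);
    }
    return NULL;
}

static void *remove_thread(void *arg)  /* writer: unmaps, then removes */
{
    (void)arg;
    for (int i = 0; i < 100000; i++) {
        pthread_rwlock_wrlock(&i_mmap_rwsem);
        mapcount = 0;                  /* hugetlb_vmdelete_list() analogue */
        /* no fault can run between the unmap and the removal, so: */
        assert(mapcount == 0);         /* VM_BUG_ON() analogue: cannot fire */
        pthread_rwlock_unlock(&i_mmap_rwsem);
    }
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, fault_thread, NULL);
    pthread_create(&b, NULL, remove_thread, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    puts("no mapped page ever seen during removal");
    return 0;
}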
@@ -482,9 +462,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
static void hugetlbfs_evict_inode(struct inode *inode)
{
+ struct address_space *mapping = inode->i_mapping;
struct resv_map *resv_map;
+ /*
+ * The vfs layer guarantees that there are no other users of this
+ * inode. Therefore, it would be safe to call remove_inode_hugepages
+ * without holding i_mmap_rwsem. We acquire and hold here to be
+ * consistent with other callers. Since there will be no contention
+ * on the semaphore, overhead is negligible.
+ */
+ i_mmap_lock_write(mapping);
remove_inode_hugepages(inode, 0, LLONG_MAX);
+ i_mmap_unlock_write(mapping);
+
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
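The cost claim in the comment above is easy to sanity-check from userspace, with the caveat that a pthread rwlock is only a loose analogue of the kernel rw_semaphore, so the numbers are merely indicative:

#define _POSIX_C_SOURCE 200809L
#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    pthread_rwlock_t l = PTHREAD_RWLOCK_INITIALIZER;
    struct timespec a, b;
    const long n = 1000000;

    clock_gettime(CLOCK_MONOTONIC, &a);
    for (long i = 0; i < n; i++) {
        pthread_rwlock_wrlock(&l);     /* never contended here */
        pthread_rwlock_unlock(&l);
    }
    clock_gettime(CLOCK_MONOTONIC, &b);
    printf("%.1f ns per uncontended wrlock/unlock pair\n",
           ((b.tv_sec - a.tv_sec) * 1e9 + (b.tv_nsec - a.tv_nsec)) / n);
    return 0;
}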
@@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
- i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
+ i_mmap_unlock_write(mapping);
return 0;
}
@@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
- i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, hole_start, hole_end);
+ i_mmap_unlock_write(mapping);
inode_unlock(inode);
}
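The truncate and hole punch hunks make the same change: the write unlock moves below remove_inode_hugepages(), so unmap and page removal now form a single critical section. A hedged sketch of the resulting shape, using stand-in helpers rather than the kernel functions:

#include <pthread.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;

static void unmap_range(long start, long end)
{
    (void)start; (void)end;            /* hugetlb_vmdelete_list() analogue */
}

static void remove_pages(long start, long end)
{
    (void)start; (void)end;            /* remove_inode_hugepages() analogue */
}

static void punch_hole(long start, long end)
{
    pthread_rwlock_wrlock(&i_mmap_rwsem);
    unmap_range(start, end);
    /* previously the lock was dropped here, leaving a window in which
     * a fault could re-map a page before removal; now removal happens
     * under the same write lock */
    remove_pages(start, end);
    pthread_rwlock_unlock(&i_mmap_rwsem);
}

int main(void)
{
    punch_hole(0, 8);
    return 0;
}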
@@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /* mutex taken here, fault path and hole punch */
+ /*
+ * The fault mutex taken here protects against the fault path
+ * and hole punch. The inode_lock taken previously protects
+ * against truncation.
+ */
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
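For context, the fault mutex is a striped lock: a fixed-size table of mutexes indexed by a hash over the mapping and page index, so unrelated pages rarely contend while operations on the same page serialize. An illustrative userspace analogue; the hash below is a simple stand-in, not hugetlb_fault_mutex_hash():

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define FAULT_MUTEX_SLOTS 64

static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_SLOTS];

static uint32_t fault_mutex_hash(const void *mapping, uint64_t index)
{
    /* simple multiplicative mix; stand-in for the kernel's hash */
    uint64_t h = (uint64_t)(uintptr_t)mapping ^
                 (index * 0x9e3779b97f4a7c15ULL);
    return (uint32_t)(h >> 32) % FAULT_MUTEX_SLOTS;
}

int main(void)
{
    int mapping;                       /* any address serves as a key */
    uint32_t hash;

    for (int i = 0; i < FAULT_MUTEX_SLOTS; i++)
        pthread_mutex_init(&fault_mutex_table[i], NULL);

    hash = fault_mutex_hash(&mapping, 42);
    pthread_mutex_lock(&fault_mutex_table[hash]);
    /* ... allocate the page and add it to the page cache here ... */
    pthread_mutex_unlock(&fault_mutex_table[hash]);
    printf("page index 42 hashed to slot %u\n", hash);
    return 0;
}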
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87fd3ab809c6..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
+ * We cannot race with truncation because we hold i_mmap_rwsem.
+ * Check once here for faults beyond end of file.
*/
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
/*
* Check for page in userfault range
*/
@@ -3854,9 +3854,6 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
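The shape of this change is check-once-under-a-stabilizing-lock: since the caller's i_mmap_rwsem keeps i_size from changing, the bounds check can run once before the retry loop, and the re-check under the page table lock (deleted just above) becomes unnecessary. An illustration-only model with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static long size_in_pages = 8;         /* stable: caller holds the rwsem */
static bool in_cache[8];

static int no_page(long idx)
{
    if (idx < 0 || idx >= size_in_pages)   /* check once, before the loop */
        return -1;

    for (;;) {                         /* "retry:" analogue */
        if (in_cache[idx])             /* find_lock_page() analogue */
            return 0;
        in_cache[idx] = true;          /* allocate and add to page cache */
        /* no i_size re-check needed here or before installing the pte */
    }
}

int main(void)
{
    printf("fault at 3 -> %d, fault at 9 -> %d\n", no_page(3), no_page(9));
    return 0;
}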
@@ -3959,8 +3956,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This prevents huge_pmd_unshare from
- * being called elsewhere and making the ptep no longer valid.
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with file truncation.
*
* ptep could have already been assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something changed.
*/
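Purpose 1 above is a lifetime rule: the fault must keep holding the lock for as long as it uses ptep, because huge_pmd_unshare() can otherwise free the page table it points into. A rough userspace model, with a heap allocation standing in for the shared page table (all names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static long *shared_table;             /* stand-in for a shared pmd page */

static void fault(void)                /* reader side */
{
    pthread_rwlock_rdlock(&i_mmap_rwsem);
    long *ptep = shared_table;         /* like huge_pte_alloc()/offset() */
    if (ptep)
        *ptep = 42;                    /* safe: the writer is excluded */
    pthread_rwlock_unlock(&i_mmap_rwsem);
}

static void unshare(void)              /* writer side */
{
    pthread_rwlock_wrlock(&i_mmap_rwsem);
    free(shared_table);                /* tear the table down */
    shared_table = NULL;
    pthread_rwlock_unlock(&i_mmap_rwsem);
}

int main(void)
{
    shared_table = calloc(1, sizeof(*shared_table));
    fault();
    unshare();
    fault();                           /* sees NULL, never a dangling ptep */
    printf("done\n");
    return 0;
}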