summaryrefslogtreecommitdiff
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c166
1 files changed, 166 insertions, 0 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0c23053951be..1b972bd05af1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2029,6 +2029,172 @@ out_release_extents:
goto out;
}
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ unsigned long zero_start;
+ loff_t size;
+ vm_fault_t ret;
+ int ret2;
+ int reserved = 0;
+ u64 reserved_space;
+ u64 page_start;
+ u64 page_end;
+ u64 end;
+
+ ASSERT(folio_order(folio) == 0);
+
+ reserved_space = PAGE_SIZE;
+
+ sb_start_pagefault(inode->i_sb);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+ end = page_end;
+
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepages() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
+ if (!ret2) {
+ ret2 = file_update_time(vmf->vma->vm_file);
+ reserved = 1;
+ }
+ if (ret2) {
+ ret = vmf_error(ret2);
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ /* Make the VM retry the fault. */
+ ret = VM_FAULT_NOPAGE;
+again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
+ lock_page(page);
+ size = i_size_read(inode);
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* Page got truncated out from underneath us. */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
+
+ /*
+ * We can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start, fs_info->sectorsize);
+ if (reserved_space < PAGE_SIZE) {
+ end = page_start + reserved_space - 1;
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
+ }
+ }
+
+ /*
+ * page_mkwrite gets called when the page is firstly dirtied after it's
+ * faulted in, but write(2) could also dirty a page and set delalloc
+ * bits, thus in this case for space account reason, we still need to
+ * clear any delalloc bits within this page range since we have to
+ * reserve data&meta space before lock_page() (see above comments).
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+ &cached_state);
+ if (ret2) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ /* Page is wholly or partially inside EOF. */
+ if (page_start + PAGE_SIZE > size)
+ zero_start = offset_in_page(size);
+ else
+ zero_start = PAGE_SIZE;
+
+ if (zero_start != PAGE_SIZE)
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
+
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
+
+out_unlock:
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+ reserved_space, (ret != 0));
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,