summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 17:09:00 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 17:09:00 -0800
commit80eabba70260dcb55b05098f6c1fecbe5c0e518b (patch)
treec9f5d19803a3387d2b9d8a6998eb9c58bad2a15a
parent852d21ae1fcdf0e4de6b5bfa730d29cb013c7ff3 (diff)
parentce98321bf7d274a470642ef99e1d82512673ce7c (diff)
Merge branch 'for-4.10/fs-unmap' of git://git.kernel.dk/linux-block
Pull fs meta data unmap optimization from Jens Axboe: "A series from Jan Kara, providing a more efficient way for unmapping meta data from in the buffer cache than doing it block-by-block. Provide a general helper that existing callers can use" * 'for-4.10/fs-unmap' of git://git.kernel.dk/linux-block: fs: Remove unmap_underlying_metadata fs: Add helper to clean bdev aliases under a bh and use it ext2: Use clean_bdev_aliases() instead of iteration ext4: Use clean_bdev_aliases() instead of iteration direct-io: Use clean_bdev_aliases() instead of handmade iteration fs: Provide function to unmap metadata for a range of blocks
-rw-r--r--fs/buffer.c104
-rw-r--r--fs/direct-io.c28
-rw-r--r--fs/ext2/inode.c9
-rw-r--r--fs/ext4/extents.c13
-rw-r--r--fs/ext4/inode.c18
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ufs/balloc.c3
-rw-r--r--fs/ufs/inode.c3
-rw-r--r--include/linux/buffer_head.h7
13 files changed, 104 insertions, 95 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index a3bfd57c2697..d21771fcf7d3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -43,6 +43,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/pagevec.h>
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page,
}
EXPORT_SYMBOL(create_empty_buffers);
-/*
- * We are taking a block for data and we don't want any output from any
- * buffer-cache aliases starting from return from that function and
- * until the moment when something will explicitly mark the buffer
- * dirty (hopefully that will not happen until we will free that block ;-)
- * We don't even need to mark it not-uptodate - nobody can expect
- * anything from a newly allocated buffer anyway. We used to used
- * unmap_buffer() for such invalidation, but that was wrong. We definitely
- * don't want to mark the alias unmapped, for example - it would confuse
- * anyone who might pick it with bread() afterwards...
- *
- * Also.. Note that bforget() doesn't lock the buffer. So there can
- * be writeout I/O going on against recently-freed buffers. We don't
- * wait on that I/O in bforget() - it's more efficient to wait on the I/O
- * only if we really need to. That happens here.
- */
-void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
+/**
+ * clean_bdev_aliases: clean a range of buffers in block device
+ * @bdev: Block device to clean buffers in
+ * @block: Start of a range of blocks to clean
+ * @len: Number of blocks to clean
+ *
+ * We are taking a range of blocks for data and we don't want writeback of any
+ * buffer-cache aliases starting from return from this function and until the
+ * moment when something will explicitly mark the buffer dirty (hopefully that
+ * will not happen until we will free that block ;-) We don't even need to mark
+ * it not-uptodate - nobody can expect anything from a newly allocated buffer
+ * anyway. We used to use unmap_buffer() for such invalidation, but that was
+ * wrong. We definitely don't want to mark the alias unmapped, for example - it
+ * would confuse anyone who might pick it with bread() afterwards...
+ *
+ * Also.. Note that bforget() doesn't lock the buffer. So there can be
+ * writeout I/O going on against recently-freed buffers. We don't wait on that
+ * I/O in bforget() - it's more efficient to wait on the I/O only if we really
+ * need to. That happens here.
+ */
+void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
- struct buffer_head *old_bh;
+ struct inode *bd_inode = bdev->bd_inode;
+ struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct pagevec pvec;
+ pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pgoff_t end;
+ int i;
+ struct buffer_head *bh;
+ struct buffer_head *head;
- might_sleep();
+ end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pagevec_init(&pvec, 0);
+ while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- old_bh = __find_get_block_slow(bdev, block);
- if (old_bh) {
- clear_buffer_dirty(old_bh);
- wait_on_buffer(old_bh);
- clear_buffer_req(old_bh);
- __brelse(old_bh);
+ index = page->index;
+ if (index > end)
+ break;
+ if (!page_has_buffers(page))
+ continue;
+ /*
+ * We use page lock instead of bd_mapping->private_lock
+ * to pin buffers here since we can afford to sleep and
+ * it scales better than a global spinlock lock.
+ */
+ lock_page(page);
+ /* Recheck when the page is locked which pins bhs */
+ if (!page_has_buffers(page))
+ goto unlock_page;
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (!buffer_mapped(bh))
+ goto next;
+ if (bh->b_blocknr >= block + len)
+ break;
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ clear_buffer_req(bh);
+next:
+ bh = bh->b_this_page;
+ } while (bh != head);
+unlock_page:
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
}
}
-EXPORT_SYMBOL(unmap_underlying_metadata);
+EXPORT_SYMBOL(clean_bdev_aliases);
/*
* Size is a power-of-two in the range 512..PAGE_SIZE,
@@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
}
bh = bh->b_this_page;
@@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
}
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping,
if (!buffer_mapped(bh))
is_mapped_to_disk = 0;
if (buffer_new(bh))
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
set_buffer_uptodate(bh);
continue;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 835e23a4ee4b..86aa79859d4d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -843,24 +843,6 @@ out:
}
/*
- * Clean any dirty buffers in the blockdev mapping which alias newly-created
- * file blocks. Only called for S_ISREG files - blockdevs do not set
- * buffer_new
- */
-static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
-{
- unsigned i;
- unsigned nblocks;
-
- nblocks = map_bh->b_size >> dio->inode->i_blkbits;
-
- for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(map_bh->b_bdev,
- map_bh->b_blocknr + i);
- }
-}
-
-/*
* If we are not writing the entire block and get_block() allocated
* the block for us, we need to fill-in the unused portion of the
* block with zeros. This happens only if user-buffer, fileoffset or
@@ -960,11 +942,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
goto do_holes;
sdio->blocks_available =
- map_bh->b_size >> sdio->blkbits;
+ map_bh->b_size >> blkbits;
sdio->next_block_for_io =
map_bh->b_blocknr << sdio->blkfactor;
- if (buffer_new(map_bh))
- clean_blockdev_aliases(dio, map_bh);
+ if (buffer_new(map_bh)) {
+ clean_bdev_aliases(
+ map_bh->b_bdev,
+ map_bh->b_blocknr,
+ map_bh->b_size >> blkbits);
+ }
if (!sdio->blkfactor)
goto do_holes;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 046b642f3585..e173afe92661 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode,
}
if (IS_DAX(inode)) {
- int i;
-
/*
* We must unmap blocks before zeroing so that writeback cannot
* overwrite zeros with stale data from block device page cache.
*/
- for (i = 0; i < count; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key),
+ count);
/*
* block must be initialised before we put it in the tree
* so that it's not found by another thread before it's
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3e1014fe835e..b1f8416923ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3777,14 +3777,6 @@ out:
return err;
}
-static void unmap_underlying_metadata_blocks(struct block_device *bdev,
- sector_t block, int count)
-{
- int i;
- for (i = 0; i < count; i++)
- unmap_underlying_metadata(bdev, block + i);
-}
-
/*
* Handle EOFBLOCKS_FL flag, clearing it if necessary
*/
@@ -4121,9 +4113,8 @@ out:
* new.
*/
if (allocated > map->m_len) {
- unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
- newblock + map->m_len,
- allocated - map->m_len);
+ clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
+ allocated - map->m_len);
allocated = map->m_len;
}
map->m_len = allocated;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 72d593fa690d..88d57af1b516 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -661,12 +661,8 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- ext4_lblk_t i;
-
- for (i = 0; i < map->m_len; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- map->m_pblk + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -1137,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -2371,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
BUG_ON(map->m_len == 0);
if (map->m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map->m_len; i++)
- unmap_underlying_metadata(bdev, map->m_pblk + i);
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
}
return 0;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index e2332a65e8fb..d83b0f3c5fe9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
}
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
set_buffer_async_write(bh);
nr_to_submit++;
diff --git a/fs/mpage.c b/fs/mpage.c
index 98fc11aa7e0b..28af984a3d96 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
+ clean_bdev_bh_alias(&map_bh);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index d0cf6fee5c77..cc91856b5e2d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -765,7 +765,7 @@ lock_retry_remap:
}
// TODO: Instantiate the hole.
// clear_buffer_new(bh);
- // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ // clean_bdev_bh_alias(bh);
ntfs_error(vol->sb, "Writing into sparse regions is "
"not supported yet. Sorry.");
err = -EOPNOTSUPP;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index bf72a2c58b75..99510d811a8c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -740,8 +740,7 @@ map_buffer_cached:
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (bh_end <= pos || bh_pos >= end)
mark_buffer_dirty(bh);
else
@@ -784,7 +783,7 @@ map_buffer_cached:
continue;
}
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* If the buffer is fully outside the write, zero it,
* set it uptodate, and mark it dirty so it gets
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9a88984f9f6f..4d9c6f5ec28a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -630,7 +630,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
if (PageUptodate(page)) {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index b035af54f538..a0376a2c1c29 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -307,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
(unsigned long long)(pos + newb), pos);
bh->b_blocknr = newb + pos;
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
mark_buffer_dirty(bh);
++j;
bh = bh->b_this_page;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 190d64be22ed..45ceb94e89e4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* we do not zeroize fragment, because of
* if it maped to hole, it already contains zeroes
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ebbacd14d450..d67ab83823ad 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -168,7 +168,12 @@ int inode_has_buffers(struct inode *);
void invalidate_inode_buffers(struct inode *);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
-void unmap_underlying_metadata(struct block_device *bdev, sector_t block);
+void clean_bdev_aliases(struct block_device *bdev, sector_t block,
+ sector_t len);
+static inline void clean_bdev_bh_alias(struct buffer_head *bh)
+{
+ clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
+}
void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);