summaryrefslogtreecommitdiff
path: root/fs/buffer.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/buffer.c')
-rw-r--r--fs/buffer.c146
1 files changed, 87 insertions, 59 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index c7abb4a029dc..ead4dc85debd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->i_private_lock);
+ /*
+ * Folio lock protects the buffers. Callers that cannot block
+ * will fallback to serializing vs try_to_free_buffers() via
+ * the i_private_lock.
+ */
+ if (atomic)
+ spin_lock(&bd_mapping->i_private_lock);
+ else
+ folio_lock(folio);
+
head = folio_buffers(folio);
if (!head)
goto out_unlock;
+ /*
+ * Upon a noref migration, the folio lock serializes here;
+ * otherwise bail.
+ */
+ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+ WARN_ON(!atomic);
+ goto out_unlock;
+ }
+
bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->i_private_lock);
+ if (atomic)
+ spin_unlock(&bd_mapping->i_private_lock);
+ else
+ folio_unlock(folio);
folio_put(folio);
out:
return ret;
@@ -286,7 +297,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
struct postprocess_bh_ctx {
@@ -411,7 +421,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
/*
@@ -656,7 +665,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh;
+
+ bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1109,27 +1120,26 @@ static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- /* Size must be multiple of hard sectorsize */
- if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
- (size < 512 || size > PAGE_SIZE))) {
- printk(KERN_ERR "getblk(): invalid block size %d requested\n",
- size);
- printk(KERN_ERR "logical block size: %d\n",
- bdev_logical_block_size(bdev));
+ bool blocking = gfpflags_allow_blocking(gfp);
- dump_stack();
+ if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
+ printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
+ size, bdev_logical_block_size(bdev));
return NULL;
}
for (;;) {
struct buffer_head *bh;
- bh = __find_get_block(bdev, block, size);
- if (bh)
- return bh;
-
if (!grow_buffers(bdev, block, size, gfp))
return NULL;
+
+ if (blocking)
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
+ if (bh)
+ return bh;
}
}
@@ -1207,10 +1217,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map) {
+ if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
- errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
- }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1386,16 +1394,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Atomic context callers may also return NULL if the buffer is being
+ * migrated; similarly the page is not marked accessed either.
*/
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+ unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
+ bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
@@ -1403,8 +1413,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
return bh;
}
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+ return find_get_block_common(bdev, block, size, true);
+}
EXPORT_SYMBOL(__find_get_block);
+/* same as __find_get_block() but allows sleeping contexts */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1447,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh;
+
+ if (gfpflags_allow_blocking(gfp))
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
might_alloc(gfp);
if (bh)
@@ -1578,8 +1608,8 @@ static void discard_buffer(struct buffer_head * bh)
bh->b_bdev = NULL;
b_state = READ_ONCE(bh->b_state);
do {
- } while (!try_cmpxchg(&bh->b_state, &b_state,
- b_state & ~BUFFER_FLAGS_DISCARD));
+ } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1644,7 +1674,6 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
filemap_release_folio(folio, 0);
out:
folio_clear_mappedtodisk(folio);
- return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2236,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(block_write_begin);
-int block_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int block_write_end(loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
{
size_t start = pos - folio_pos(folio);
@@ -2269,15 +2297,15 @@ int block_write_end(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(block_write_end);
-int generic_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* No need to use i_size_read() here, the i_size cannot change under us
@@ -2466,7 +2494,8 @@ out:
}
EXPORT_SYMBOL(generic_cont_expand_simple);
-static int cont_expand_zero(struct file *file, struct address_space *mapping,
+static int cont_expand_zero(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
@@ -2490,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2523,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2543,17 +2572,16 @@ out:
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
*/
-int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata,
- get_block_t *get_block, loff_t *bytes)
+int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata, get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
unsigned int zerofrom;
int err;
- err = cont_expand_zero(file, mapping, pos, bytes);
+ err = cont_expand_zero(iocb, mapping, pos, bytes);
if (err)
return err;
@@ -2575,7 +2603,7 @@ EXPORT_SYMBOL(cont_write_begin);
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
- * We are not allowed to take the i_mutex here so we have to play games to
+ * We are not allowed to take the i_rwsem here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
@@ -2695,7 +2723,7 @@ unlock:
EXPORT_SYMBOL(block_truncate_page);
/*
- * The generic ->writepage function for buffer-backed address_spaces
+ * The generic write folio function for buffer-backed address_spaces
*/
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
void *get_block)
@@ -2715,7 +2743,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
/*
* The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
+ * writeback invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."