Diffstat (limited to 'fs/ext4/balloc.c')
-rw-r--r--  fs/ext4/balloc.c  299
1 file changed, 200 insertions(+), 99 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee61ff48..8040c731b3e4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
#include "mballoc.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
@@ -80,32 +81,54 @@ static inline int ext4_block_in_group(struct super_block *sb,
return (actual_group == block_group) ? 1 : 0;
}
-/* Return the number of clusters used for file system metadata; this
+/*
+ * Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- unsigned num_clusters;
- int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ unsigned base_clusters, num_clusters;
+ int block_cluster = -1, inode_cluster;
+ int itbl_cluster_start = -1, itbl_cluster_end = -1;
ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
- ext4_fsblk_t itbl_blk;
+ ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+ ext4_fsblk_t itbl_blk_start, itbl_blk_end;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* This is the number of clusters used by the superblock,
* block group descriptors, and reserved block group
* descriptor blocks */
- num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+ base_clusters = ext4_num_base_meta_clusters(sb, block_group);
+ num_clusters = base_clusters;
+
+ /*
+ * Account and record inode table clusters if any cluster
+ * is in the block group, or inode table cluster range is
+ * [-1, -1] and won't overlap with block/inode bitmap cluster
+ * accounted below.
+ */
+ itbl_blk_start = ext4_inode_table(sb, gdp);
+ itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
+ if (itbl_blk_start <= end && itbl_blk_end >= start) {
+ itbl_blk_start = max(itbl_blk_start, start);
+ itbl_blk_end = min(itbl_blk_end, end);
+
+ itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
+ itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
+
+ num_clusters += itbl_cluster_end - itbl_cluster_start + 1;
+ /* check if border cluster is overlapped */
+ if (itbl_cluster_start == base_clusters - 1)
+ num_clusters--;
+ }
/*
- * For the allocation bitmaps and inode table, we first need
- * to check to see if the block is in the block group. If it
- * is, then check to see if the cluster is already accounted
- * for in the clusters used for the base metadata cluster, or
- * if we can increment the base metadata cluster to include
- * that block. Otherwise, we will have to track the cluster
- * used for the allocation bitmap or inode table explicitly.
+ * For the allocation bitmaps, we first need to check to see
+ * if the block is in the block group. If it is, then check
+ * to see if the cluster is already accounted for in the clusters
+ * used for the base metadata cluster and inode tables cluster.
* Normally all of these blocks are contiguous, so the special
* case handling shouldn't be necessary except for *very*
* unusual file system layouts.
@@ -113,46 +136,26 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
block_cluster = EXT4_B2C(sbi,
ext4_block_bitmap(sb, gdp) - start);
- if (block_cluster < num_clusters)
- block_cluster = -1;
- else if (block_cluster == num_clusters) {
+ if (block_cluster >= base_clusters &&
+ (block_cluster < itbl_cluster_start ||
+ block_cluster > itbl_cluster_end))
num_clusters++;
- block_cluster = -1;
- }
}
if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
inode_cluster = EXT4_B2C(sbi,
ext4_inode_bitmap(sb, gdp) - start);
- if (inode_cluster < num_clusters)
- inode_cluster = -1;
- else if (inode_cluster == num_clusters) {
- num_clusters++;
- inode_cluster = -1;
- }
- }
-
- itbl_blk = ext4_inode_table(sb, gdp);
- for (i = 0; i < sbi->s_itb_per_group; i++) {
- if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
- c = EXT4_B2C(sbi, itbl_blk + i - start);
- if ((c < num_clusters) || (c == inode_cluster) ||
- (c == block_cluster) || (c == itbl_cluster))
- continue;
- if (c == num_clusters) {
- num_clusters++;
- continue;
- }
+ /*
+ * Additional check if inode bitmap is in just accounted
+ * block_cluster
+ */
+ if (inode_cluster != block_cluster &&
+ inode_cluster >= base_clusters &&
+ (inode_cluster < itbl_cluster_start ||
+ inode_cluster > itbl_cluster_end))
num_clusters++;
- itbl_cluster = c;
- }
}
- if (block_cluster != -1)
- num_clusters++;
- if (inode_cluster != -1)
- num_clusters++;
-
return num_clusters;
}
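The accounting above clips the inode table's block range to the group before converting it to clusters, using a standard closed-interval overlap test. A minimal sketch of that test, with a hypothetical helper name (not part of the patch):

	/* Illustrative only: closed ranges [a_start, a_end] and [b_start, b_end]
	 * overlap iff each range starts no later than the other one ends. This
	 * is the same condition as "itbl_blk_start <= end && itbl_blk_end >= start"
	 * in the hunk above. */
	static inline bool ranges_overlap(ext4_fsblk_t a_start, ext4_fsblk_t a_end,
					  ext4_fsblk_t b_start, ext4_fsblk_t b_end)
	{
		return a_start <= b_end && a_end >= b_start;
	}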
@@ -185,10 +188,8 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
- J_ASSERT_BH(bh, buffer_locked(bh));
+ ASSERT(buffer_locked(bh));
- /* If checksum is bad mark all blocks used to prevent allocation
- * essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT |
@@ -239,7 +240,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
- return num_clusters_in_group(sb, block_group) -
+ return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
@@ -270,6 +271,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh_p;
+
+ KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+ sb, block_group, bh);
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +285,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
+ bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+ /*
+ * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+ * the pointer being dereferenced won't be dereferenced again. By
+ * looking at the usage in add_new_gdb() the value isn't modified,
+ * just the pointer, and so it remains valid.
+ */
+ if (!bh_p) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
@@ -288,13 +300,43 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}
desc = (struct ext4_group_desc *)(
- (__u8 *)sbi->s_group_desc[group_desc]->b_data +
+ (__u8 *)bh_p->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
- *bh = sbi->s_group_desc[group_desc];
+ *bh = bh_p;
return desc;
}
+static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_grpblk_t next_zero_bit;
+ unsigned long bitmap_size = sb->s_blocksize * 8;
+ unsigned int offset = num_clusters_in_group(sb, block_group);
+
+ if (bitmap_size <= offset)
+ return 0;
+
+ next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset);
+
+ return (next_zero_bit < bitmap_size ? next_zero_bit : 0);
+}
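ext4_valid_block_bitmap_padding() only inspects the tail of the bitmap past the last cluster of the group; every bit in that padding is expected to already be set. A rough worked example of the numbers involved (assuming a 4 KiB block size; the cluster count is illustrative):

	/* Illustrative only: a 4096-byte bitmap gives bitmap_size = 32768 bits.
	 * If num_clusters_in_group() == 30000, bits 30000..32767 are padding,
	 * and ext4_find_next_zero_bit(bh->b_data, 32768, 30000) must return
	 * 32768 (no zero bit found) for the bitmap to be accepted; any smaller
	 * value is returned to the caller and reported as corruption. */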
+
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info **grp_info;
+ long indexv, indexh;
+
+ if (unlikely(group >= EXT4_SB(sb)->s_groups_count))
+ return NULL;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
+}
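The indexv/indexh split in ext4_get_group_info() mirrors how ext4_get_group_desc() addresses descriptors one block at a time. For example (assuming a 4 KiB block size and 64-byte descriptors, so EXT4_DESC_PER_BLOCK == 64; the group number is illustrative):

	/* Illustrative only: group 130 with 64 descriptors per block
	 * (EXT4_DESC_PER_BLOCK_BITS == 6):
	 *   indexv = 130 >> 6 = 2   -> which s_group_info chunk to dereference
	 *   indexh = 130 & 63 = 2   -> slot within that chunk
	 */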
+
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
@@ -342,13 +384,13 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit)
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit)
return blk;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group),
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1,
EXT4_B2C(sbi, offset));
if (next_zero_bit <
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group))
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1)
/* bad bitmap for inode tables */
return blk;
return 0;
@@ -360,18 +402,23 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
if (buffer_verified(bh))
goto verified;
- if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) ||
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -387,6 +434,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
+ blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
+ block_group, blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ return -EFSCORRUPTED;
+ }
set_buffer_verified(bh);
verified:
ext4_unlock_group(sb, block_group);
@@ -397,14 +453,16 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
- * Return buffer_head on success or NULL in case of failure.
+ * Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -412,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
ext4_fsblk_t bitmap_blk;
int err;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+ sb, block_group, ignore_locked);
+
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
@@ -432,6 +493,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
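With ignore_locked == true, a NULL return simply means the buffer is already under IO, so a prefetch-style caller can skip the group rather than treat it as a failure. A schematic caller (hypothetical, not part of this patch):

	bh = ext4_read_block_bitmap_nowait(sb, group, true);
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	if (!bh)
		return 0;	/* bitmap already being read; nothing to do */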
+
if (bitmap_uptodate(bh))
goto verify;
@@ -452,17 +519,19 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
goto out;
}
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
- set_buffer_uptodate(bh);
- set_buffer_verified(bh);
- ext4_unlock_group(sb, block_group);
- unlock_buffer(bh);
if (err) {
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
ext4_error(sb, "Failed to init block bitmap for group "
"%u: %d", block_group, err);
goto out;
}
- goto verify;
+ set_bitmap_uptodate(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
+ return bh;
}
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
@@ -478,10 +547,11 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0),
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -493,12 +563,15 @@ out:
return ERR_PTR(err);
}
-/* Returns 0 on success, 1 on error */
+/* Returns 0 on success, -errno on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
struct ext4_group_desc *desc;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+ sb, block_group, bh);
+
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -506,9 +579,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
return -EFSCORRUPTED;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, (unsigned long long) bh->b_blocknr);
+ ext4_error_err(sb, EIO, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
@@ -524,7 +597,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
@@ -576,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
@@ -603,28 +676,48 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
}
/**
- * ext4_should_retry_alloc()
- * @sb: super block
- * @retries number of attemps has been made
+ * ext4_should_retry_alloc() - check if a block allocation should be retried
+ * @sb: superblock
+ * @retries: number of retry attempts made so far
*
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE. We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks. If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 1 ||
- !EXT4_SB(sb)->s_journal)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!sbi->s_journal)
return 0;
- smp_mb();
- if (EXT4_SB(sb)->s_mb_free_pending == 0)
+ if (++(*retries) > 3) {
+ percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
return 0;
+ }
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ /*
+ * if there's no indication that blocks are about to be freed it's
+ * possible we just missed a transaction commit that did so
+ */
+ smp_mb();
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
+ if (test_opt(sb, DISCARD)) {
+ atomic_inc(&sbi->s_retry_alloc_pending);
+ flush_work(&sbi->s_discard_work);
+ atomic_dec(&sbi->s_retry_alloc_pending);
+ }
+ return ext4_has_free_clusters(sbi, 1, 0);
+ }
+
+ /*
+ * it's possible we've just missed a transaction commit here,
+ * so ignore the returned status
+ */
+ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
+ (void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
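Callers typically drive ext4_should_retry_alloc() from a small retry loop around the allocating operation; a schematic of that pattern (the allocation call is a placeholder, not a real function):

	int err, retries = 0;

retry:
	err = do_allocating_operation(inode);	/* hypothetical placeholder */
	if (err == -ENOSPC &&
	    ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;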
@@ -659,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
- * fail EDQUOT for metdata, but we do account for it.
+ * fail EDQUOT for metadata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
dquot_alloc_block_nofail(inode,
@@ -828,11 +921,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
}
/*
- * This function returns the number of file system metadata clusters at
+ * This function returns the number of file system metadata blocks at
* the beginning of a block group, including the reserved gdt blocks.
*/
-static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
- ext4_group_t block_group)
+unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned num;
@@ -844,14 +937,21 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
- num += ext4_bg_num_gdb(sb, block_group);
+ num += ext4_bg_num_gdb_nometa(sb, block_group);
num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- num += ext4_bg_num_gdb(sb, block_group);
+ num += ext4_bg_num_gdb_meta(sb, block_group);
}
- return EXT4_NUM_B2C(sbi, num);
+ return num;
}
+
+static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group));
+}
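EXT4_NUM_B2C() rounds a block count up to whole clusters, so the new wrapper only differs from the raw block count on bigalloc file systems. A rough worked example (assuming a cluster ratio of 16, i.e. s_cluster_bits == 4; numbers illustrative):

	/* Illustrative only: 5 base metadata blocks with 16-block clusters:
	 *   EXT4_NUM_B2C(sbi, 5) = (5 + 16 - 1) >> 4 = 1 cluster
	 * With the default cluster ratio of 1 the result equals the block count. */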
+
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
@@ -893,10 +993,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
return bg_start;
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
+ colour = (task_pid_nr(current) % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ colour = (task_pid_nr(current) % 16) *
+ ((last_block - bg_start) / 16);
return bg_start + colour;
}
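The colour term spreads the goal blocks of different tasks across a block group. A rough worked example for the first branch, where a full group fits below last_block (assuming 32768 blocks per group; the pid is arbitrary):

	/* Illustrative only:
	 *   task_pid_nr(current) % 16 = 12345 % 16 = 9
	 *   colour = 9 * (32768 / 16) = 18432
	 * so the allocation hint is bg_start + 18432. */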