diff options
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- | fs/ext4/mballoc.c | 596 |
1 files changed, 321 insertions, 275 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f44f668e407f..1e98c5be4e0a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -187,7 +187,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * /sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -209,7 +209,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices @@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -677,7 +677,7 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -832,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) return 0; if (order == MB_NUM_ORDERS(sb)) order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; return order; } @@ -842,7 +843,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, @@ -871,7 +872,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; @@ -945,7 +946,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -990,7 +991,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -1009,6 +1010,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1077,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. If linear traversal should not be - * performed, this function just returns the same group + * Return next linear group for allocation. */ static ext4_group_t -next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, - ext4_group_t ngroups) +next_linear_group(ext4_group_t group, ext4_group_t ngroups) { - if (!should_optimize_scan(ac)) - goto inc_and_return; - - if (ac->ac_groups_linear_remaining) { - ac->ac_groups_linear_remaining--; - goto inc_and_return; - } - - return group; -inc_and_return: /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. @@ -1119,21 +1110,33 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); + if (!should_optimize_scan(ac)) { + *group = next_linear_group(*group, ngroups); + return; + } + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (ac->ac_groups_linear_remaining) { + *group = next_linear_group(*group, ngroups); + ac->ac_groups_linear_remaining--; return; } if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. */ WARN_ON(1); } @@ -1233,6 +1236,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, atomic64_add(period, &sbi->s_mb_generation_time); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) + mb_set_bits(buddy, 0, count); + + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1253,7 +1274,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; @@ -1271,13 +1292,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - inode = page->mapping->host; + inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; - mb_debug(sb, "init page %lu\n", page->index); + mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) @@ -1292,9 +1313,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else bh = &bhs; - first_group = page->index * blocks_per_page / 2; + first_group = folio->index * blocks_per_page / 2; - /* read all groups the page covers into the cache */ + /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; @@ -1305,10 +1326,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, + * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1332,7 +1354,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; + first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) @@ -1353,7 +1375,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* @@ -1368,8 +1390,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -1387,8 +1409,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1406,7 +1428,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { @@ -1422,7 +1444,7 @@ out: * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) @@ -1430,10 +1452,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; - struct page *page; + struct folio *folio; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* @@ -1444,12 +1466,13 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ @@ -1457,23 +1480,24 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, } /* blocks_per_page == 1, hence we need another page for the buddy */ - page = find_or_create_page(inode->i_mapping, block + 1, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } @@ -1488,7 +1512,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1515,16 +1539,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force @@ -1534,11 +1558,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1560,7 +1584,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int block; int pnum; int poff; - struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1578,8 +1602,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1600,102 +1624,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. */ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + + mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: - if (page) - put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1710,10 +1735,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -1891,11 +1916,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); - this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1909,21 +1929,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; + /* + * Fastcommit replay can free already freed blocks which + * corrupts allocation info. Regenerate it. + */ + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + mb_regenerate_buddy(e4b); + goto check; + } + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); - if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block (bit %u); block bitmap corrupt.", - block); - ext4_mark_group_bitmap_corrupted( - sb, e4b->bd_group, + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); - } - goto done; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, blocknr, + "freeing already freed block (bit %u); block bitmap corrupt.", + block); + return; } + this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; @@ -1948,9 +1978,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); -done: mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); +check: mb_check_buddy(e4b); } @@ -2018,13 +2048,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; + int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2049,16 +2078,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2072,20 +2097,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -2127,10 +2161,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2276,6 +2310,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, return; ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { @@ -2283,6 +2320,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } @@ -2309,18 +2347,16 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { - ext4_mb_unload_buddy(e4b); - return 0; - } - ext4_lock_group(ac->ac_sb, group); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + goto out; + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && - ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { + ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2347,6 +2383,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } +out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); @@ -2380,12 +2417,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2436,12 +2473,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -2467,12 +2504,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2516,7 +2553,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - stripe = EXT4_B2C(sbi, sbi->s_stripe); + stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { @@ -2650,7 +2687,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, int ret; /* - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this @@ -2831,6 +2868,7 @@ repeat: group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; + nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { @@ -2890,9 +2928,11 @@ repeat: if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); else { - bool is_stripe_aligned = sbi->s_stripe && + bool is_stripe_aligned = + (sbi->s_stripe >= + sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe)); + EXT4_NUM_B2C(sbi, sbi->s_stripe)); if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && @@ -2990,17 +3030,15 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err, buddy_loaded = 0; + int i, err; + char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; - } sg; + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) @@ -3008,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); @@ -3018,24 +3056,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } - buddy_loaded = 1; - } - - memcpy(&sg, grinfo, i); - - if (buddy_loaded) ext4_mb_unload_buddy(&e4b); + } - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); - seq_puts(seq, " ]\n"); - + sg->bb_counters[i] : 0); + seq_puts(seq, " ]"); + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) + seq_puts(seq, " Block bitmap corrupted!"); + seq_putc(seq, '\n'); return 0; } @@ -3158,7 +3198,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; @@ -3412,10 +3451,11 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); - /* now many real IOs to prefetch within a single allocation at cr=0 - * given cr=0 is an CPU-related optimization we shouldn't try to - * load too many groups, at some point we should start to use what - * we've got in memory. + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ @@ -3666,7 +3706,7 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); + sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -3725,7 +3765,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3801,13 +3841,10 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count, - struct bio **biop) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; @@ -3816,13 +3853,8 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - if (biop) { - return __blkdev_issue_discard(sb->s_bdev, - (sector_t)discard_block << (sb->s_blocksize_bits - 9), - (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, biop); - } else - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, @@ -3854,18 +3886,15 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -5146,10 +5175,16 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); + loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); - /* we can't allocate as much as normalizer wants. - * so, found space must get proper lstart - * to cover original request */ + /* + * We can't allocate as much as normalizer wants, so we try + * to get proper lstart to cover the original request, except + * when the goal doesn't cover the original request as below: + * + * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 + * best_ex:0/200(200) -> adjusted: 1848/2048(200) + */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); @@ -5161,7 +5196,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and - * still cover original start + * still cover original end * 3. Else, keep the best ex at start of original request. */ ex.fe_len = ac->ac_b_ex.fe_len; @@ -5171,7 +5206,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; - if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; @@ -5179,7 +5214,6 @@ adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); - BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } @@ -5284,7 +5318,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5334,11 +5368,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5352,13 +5384,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5479,7 +5509,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -5491,9 +5521,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5501,15 +5530,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5533,7 +5559,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5626,7 +5651,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5660,7 +5685,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" @@ -5684,7 +5709,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); - mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); @@ -5943,7 +5968,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -5973,14 +5998,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - put_page(ac->ac_buddy_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) @@ -6030,7 +6054,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, } out_dbg: - mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } @@ -6098,6 +6122,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; + *errp = 0; return block; } @@ -6292,8 +6317,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6304,8 +6329,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_buddy_folio); + folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; @@ -6474,14 +6499,21 @@ do_more: } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, - count_clusters, NULL); - if (err && err != -EOPNOTSUPP) + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. + */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); - } else - EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + } + + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -6610,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } @@ -6725,7 +6758,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count, NULL); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; @@ -6761,6 +6794,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) bool set_trimmed = false; void *bitmap; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return 0; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) @@ -6962,13 +6998,14 @@ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, + ext4_grpblk_t first, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; - ext4_grpblk_t next; + ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; @@ -6979,10 +7016,19 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = max(e4b.bd_info->bb_first_free, start); + start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) |