Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r-- | fs/btrfs/space-info.c | 529
1 file changed, 429 insertions, 100 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d620323d08ea..d9087aa81b21 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/spinlock.h> +#include <linux/minmax.h> #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -12,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -47,11 +50,11 @@ * num_bytes we want to reserve. * * ->reserve - * space_info->bytes_may_reserve += num_bytes + * space_info->bytes_may_use += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does - * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_may_use -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference @@ -125,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. + * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -161,7 +172,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -190,6 +201,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) + /* * Calculate chunk size depending on volume type (regular or zoned). 
*/ @@ -221,18 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, WRITE_ONCE(space_info->chunk_size, chunk_size); } -static int create_space_info(struct btrfs_fs_info *info, u64 flags) +static void init_space_info(struct btrfs_fs_info *info, + struct btrfs_space_info *space_info, u64 flags) { - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + space_info->fs_info = info; + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); @@ -243,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY; if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +} + +static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags, + enum btrfs_space_info_sub_group id, int index) +{ + struct btrfs_fs_info *fs_info = parent->fs_info; + struct btrfs_space_info *sub_group; + int ret; + + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + + sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); + if (!sub_group) + return -ENOMEM; + + init_space_info(fs_info, sub_group, flags); + parent->sub_group[index] = sub_group; + sub_group->parent = parent; + sub_group->subgroup_id = id; + + ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + if (ret) { + kfree(sub_group); + parent->sub_group[index] = NULL; + } + return ret; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int ret = 0; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + init_space_info(info, space_info, flags); + + if (btrfs_is_zoned(info)) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_DATA_RELOC, + 0); + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_TREELOG, + 0); + + if (ret) + return ret; + } ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -298,31 +359,29 @@ out: void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct btrfs_space_info *found; + struct btrfs_space_info *space_info = block_group->space_info; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, block_group->flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += block_group->length; - found->disk_total += block_group->length * factor; - found->bytes_used += block_group->used; - found->disk_used += block_group->used * factor; - found->bytes_readonly += block_group->bytes_super; - found->bytes_zone_unusable += block_group->zone_unusable; + spin_lock(&space_info->lock); + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + space_info->bytes_used += block_group->used; + space_info->disk_used += block_group->used * factor; + space_info->bytes_readonly += block_group->bytes_super; + 
btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - found->full = 0; - btrfs_try_granting_tickets(info, found); - spin_unlock(&found->lock); + space_info->full = 0; + btrfs_try_granting_tickets(info, space_info); + spin_unlock(&space_info->lock); - block_group->space_info = found; + block_group->space_info = space_info; index = btrfs_bg_flags_to_raid_index(block_group->flags); - down_write(&found->groups_sem); - list_add_tail(&block_group->list, &found->block_groups[index]); - up_write(&found->groups_sem); + down_write(&space_info->groups_sem); + list_add_tail(&block_group->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -340,11 +399,32 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } +static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo; + u64 data_chunk_size; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no + * more than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size) + * as it is. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + if (btrfs_is_zoned(fs_info)) + return data_sinfo->chunk_size; + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + return min_t(u64, data_chunk_size, SZ_1G); +} + static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; u64 data_chunk_size; @@ -368,16 +448,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, if (avail == 0) return 0; - /* - * Calculate the data_chunk_size, space_info->chunk_size is the - * "optimal" chunk size based on the fs size. However when we actually - * allocate the chunk we will strip this down further, making it no more - * than 10% of the disk or 1G, whichever is smaller. - */ - data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + data_chunk_size = calc_effective_data_chunk_size(fs_info); /* * Since data allocations immediately use block groups as part of the @@ -405,11 +476,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. 
+ */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -461,9 +543,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -514,15 +594,16 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", - flag_str, + btrfs_info(fs_info, + "space_info %s (sub-group id %d) has %lld free, is %sfull", + flag_str, info->subgroup_id, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, @@ -555,8 +636,7 @@ again: spin_lock(&cache->lock); avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->delalloc_bytes - - cache->bytes_super - cache->zone_unusable; + cache->reserved - cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, @@ -587,8 +667,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -#define EXTENT_SIZE_PER_ITEM SZ_256K - /* * shrink metadata reservation for delalloc */ @@ -688,7 +766,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, items, NULL); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -780,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_chunk_alloc(trans, + ret = btrfs_chunk_alloc(trans, space_info, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); @@ -807,14 +885,10 @@ static void flush_space(struct btrfs_fs_info *fs_info, * because that does not wait for a transaction to fully commit * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). 
*/ - trans = btrfs_attach_transaction_barrier(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - if (ret == -ENOENT) - ret = 0; - break; - } - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(root); + break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); break; default: ret = -ENOSPC; @@ -826,9 +900,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -853,7 +926,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1056,22 +1129,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, return (tickets_id != space_info->tickets_id); } -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1124,7 +1194,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1138,7 +1208,26 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); +} + +/* + * This is for normal flushers, it can wait as much time as needed. We will + * loop and continuously try to flush as long as we are making progress. We + * count progress as clearing off tickets each time we have to loop. 
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + do_async_reclaim_metadata_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) + do_async_reclaim_metadata_space(space_info->sub_group[i]); + } } /* @@ -1262,13 +1351,17 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. + * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1278,19 +1371,16 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; -static void btrfs_async_reclaim_data_space(struct work_struct *work) +static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; - fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); - space_info = fs_info->data_sinfo; - spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1358,6 +1448,19 @@ aborted_fs: spin_unlock(&space_info->lock); } +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + do_async_reclaim_data_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) + if (space_info->sub_group[i]) + do_async_reclaim_data_space(space_info->sub_group[i]); +} + void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); @@ -1369,6 +1472,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1382,6 +1486,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1471,8 +1576,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, 
+static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1530,7 +1634,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1674,8 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1687,8 +1790,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -1800,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. */ -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_fs_info *fs_info = space_info->fs_info; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || @@ -1811,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + ret = __reserve_bytes(fs_info, space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - data_sinfo->flags, bytes, 1); + space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, 0); } return ret; } @@ -1868,3 +1970,230 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * A reasonable buffer for unallocated space is 10 data block_groups. + * If we claw this back repeatedly, we can still achieve efficient + * utilization when near full, and not do too much reclaim while + * always maintaining a solid buffer for workloads that quickly + * allocate and pressure the unallocated space. 
+ */ +static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) +{ + u64 chunk_sz = calc_effective_data_chunk_size(fs_info); + + return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz; +} + +/* + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. + * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * Furthermore, we want to avoid doing too much reclaim even if there are good + * candidates. This is because the allocator is pretty good at filling up the + * holes with writes. So we want to do just enough reclaim to try and stay + * safe from running out of unallocated space but not be wasteful about it. + * + * Therefore, the dynamic reclaim threshold is calculated as follows: + * - calculate a target unallocated amount of 5 block group sized chunks + * - ratchet up the intensity of reclaim depending on how far we are from + * that target by using a formula of unalloc / target to set the threshold. + * + * Typically with 10 block groups as the target, the discrete values this comes + * out to are 0, 10, 20, ... , 80, 90, and 99. + */ +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 target = calc_unalloc_target(fs_info); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + u64 want = target > unalloc ? target - unalloc : 0; + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + /* If we have no unused space, don't bother, it won't work anyway. */ + if (unused < data_chunk_size) + return 0; + + /* Cast to int is OK because want <= target. */ + return calc_pct_ratio(want, target); +} + +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} + +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. 
+ */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 data_chunk_size = calc_effective_data_chunk_size(fs_info); + + return unalloc < data_chunk_size; +} + +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + bool try_again = true; + bool urgent; + + spin_lock(&space_info->lock); + urgent = is_reclaim_urgent(space_info); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); +again: + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; + reclaim = true; + } + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. + */ + if (try_again && urgent) { + try_again = false; + goto again; + } + + up_read(&space_info->groups_sem); +} + +void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) +{ + u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info); + + lockdep_assert_held(&space_info->lock); + space_info->reclaimable_bytes += bytes; + + if (space_info->reclaimable_bytes >= chunk_sz) + btrfs_set_periodic_reclaim_ready(space_info, true); +} + +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready) +{ + lockdep_assert_held(&space_info->lock); + if (!READ_ONCE(space_info->periodic_reclaim)) + return; + if (ready != space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = ready; + if (!ready) + space_info->reclaimable_bytes = 0; + } +} + +bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +{ + bool ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return false; + if (!READ_ONCE(space_info->periodic_reclaim)) + return false; + + spin_lock(&space_info->lock); + ret = space_info->periodic_reclaim_ready; + btrfs_set_periodic_reclaim_ready(space_info, false); + spin_unlock(&space_info->lock); + + return ret; +} + +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) +{ + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (!btrfs_should_periodic_reclaim(space_info)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(space_info, raid); + } +} + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. 
*/ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} |
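
Two of the calculations above reduce to simple arithmetic that is easy to check outside the kernel: calc_effective_data_chunk_size() clamps the data chunk size to the smaller of the space_info chunk size, 10% of the writable device bytes and 1G (or uses the zone size as-is on zoned filesystems), and calc_available_free_space() rounds the overcommit headroom down to a zone-size multiple in zoned mode. The sketch below is a stand-alone user-space model of that arithmetic only; the helper names, the main() harness and the sample sizes are invented for illustration and are not part of the patch.

/* User-space model of the chunk-size clamp and the zoned ALIGN_DOWN. */
#include <stdio.h>
#include <stdint.h>

#define SZ_1G (1ULL << 30)

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Non-zoned: no more than 10% of the writable bytes or 1G, whichever is smaller. */
static uint64_t effective_data_chunk_size(uint64_t optimal_chunk_size,
					  uint64_t total_rw_bytes,
					  int zoned, uint64_t zone_size)
{
	if (zoned)
		return zone_size;	/* a zoned chunk is exactly one zone */
	return min_u64(min_u64(optimal_chunk_size, total_rw_bytes / 10), SZ_1G);
}

/* Zoned mode: only whole zones can be allocated, so round the headroom down. */
static uint64_t align_down_to_zone(uint64_t avail, uint64_t zone_size)
{
	return avail - (avail % zone_size);
}

int main(void)
{
	uint64_t chunk = effective_data_chunk_size(10 * SZ_1G, 500 * SZ_1G, 0, 0);
	uint64_t avail = align_down_to_zone(5 * SZ_1G + 123456, 256ULL << 20);

	printf("effective data chunk size: %llu MiB\n",
	       (unsigned long long)(chunk >> 20));
	printf("zone-aligned headroom:     %llu MiB\n",
	       (unsigned long long)(avail >> 20));
	return 0;
}

With a 500G device the 10% cap would allow 50G, so the 1G ceiling wins; on a zoned filesystem the same helper simply returns the zone size, which is why the headroom alignment in the patch uses fs_info->zone_size.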
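
The dynamic reclaim threshold at the end of the patch is likewise a small formula: the target is BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10) effective data chunks of unallocated space, the shortfall against that target is converted to a percentage by calc_pct_ratio(), and block groups whose usage falls below that percentage of their length become reclaim candidates. Below is a minimal user-space model of the formula, assuming a 1G data chunk; the main() loop, the simplified overflow handling and the helper names are illustrative only.

/* User-space model of calc_pct_ratio() and the dynamic reclaim threshold. */
#include <stdio.h>
#include <stdint.h>

#define UNALLOC_BLOCK_GROUP_TARGET 10ULL

/* 100 * x / y, dropping precision instead of overflowing, as in the patch. */
static uint64_t calc_pct_ratio(uint64_t x, uint64_t y)
{
	if (!y)
		return 0;
	while (x > UINT64_MAX / 100) {	/* 100 * x would overflow */
		x >>= 10;
		y >>= 10;
		if (!y)
			y = 1;
	}
	return (100 * x) / y;
}

/*
 * Threshold in percent: the further the unallocated space falls below the
 * target of 10 data chunks, the more aggressively we reclaim.
 */
static int dynamic_reclaim_threshold(uint64_t unalloc, uint64_t chunk_size,
				     uint64_t unused_in_space_info)
{
	uint64_t target = UNALLOC_BLOCK_GROUP_TARGET * chunk_size;
	uint64_t want = target > unalloc ? target - unalloc : 0;

	/* No unused space to relocate into: reclaiming cannot help. */
	if (unused_in_space_info < chunk_size)
		return 0;

	return (int)calc_pct_ratio(want, target);
}

int main(void)
{
	const uint64_t chunk = 1ULL << 30;	/* assume a 1G data chunk */

	/* Sweep the unallocated space down from 10 chunks to 0. */
	for (int left = 10; left >= 0; left--) {
		uint64_t unalloc = (uint64_t)left * chunk;

		printf("unalloc = %2d chunks -> threshold = %d%%\n", left,
		       dynamic_reclaim_threshold(unalloc, chunk, 4 * chunk));
	}
	return 0;
}

Each chunk of unallocated space that disappears raises the threshold by roughly another 10%, matching the 0, 10, 20, ... progression described in the comment above calc_dynamic_reclaim_threshold(); once the unallocated space drops below a single chunk, is_reclaim_urgent() lets do_reclaim_sweep() make a second pass that will take even freshly allocated block groups.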