summaryrefslogtreecommitdiff
path: root/fs/btrfs/block-group.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r--fs/btrfs/block-group.c3222
1 files changed, 2240 insertions, 982 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 176e8a292fd1..08b14449fabe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/sizes.h>
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -15,6 +17,35 @@
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
+#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+
+#ifdef CONFIG_BTRFS_DEBUG
+int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -22,9 +53,9 @@
*
* Should be called with balance_lock held
*/
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
{
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
if (!bctl)
@@ -65,11 +96,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
- /* Pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
@@ -80,14 +108,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
}
allowed &= flags;
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ /* Select the highest-redundancy RAID level. */
+ if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C3;
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
allowed = BTRFS_BLOCK_GROUP_RAID5;
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
allowed = BTRFS_BLOCK_GROUP_RAID10;
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+ allowed = BTRFS_BLOCK_GROUP_DUP;
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
allowed = BTRFS_BLOCK_GROUP_RAID0;
@@ -118,14 +153,23 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
- atomic_inc(&cache->count);
+ refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
- if (atomic_dec_and_test(&cache->count)) {
+ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
@@ -136,58 +180,47 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
- /*
- * If not empty, someone is still holding mutex of
- * full_stripe_lock, which can only be released by caller.
- * And it will definitely cause use-after-free when caller
- * tries to release full stripe lock.
- *
- * No better way to resolve, but only to warn.
- */
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
+static int btrfs_bg_start_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_block_group *new_bg =
+ rb_entry(new, struct btrfs_block_group, cache_node);
+ const struct btrfs_block_group *exist_bg =
+ rb_entry(exist, struct btrfs_block_group, cache_node);
+
+ if (new_bg->start < exist_bg->start)
+ return -1;
+ if (new_bg->start > exist_bg->start)
+ return 1;
+ return 0;
+}
+
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group *block_group)
+static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_block_group *cache;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct rb_node *exist;
+ int ret = 0;
ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&fs_info->block_group_cache_lock);
- while (*p) {
- parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group, cache_node);
- if (block_group->start < cache->start) {
- p = &(*p)->rb_left;
- } else if (block_group->start > cache->start) {
- p = &(*p)->rb_right;
- } else {
- spin_unlock(&info->block_group_cache_lock);
- return -EEXIST;
- }
- }
+ exist = rb_find_add_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
+ if (exist)
+ ret = -EEXIST;
+ write_unlock(&fs_info->block_group_cache_lock);
- rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
-
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
-
- return 0;
+ return ret;
}
/*
@@ -201,8 +234,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -224,12 +257,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -258,15 +288,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -275,46 +305,68 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/*
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increments the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we can not do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/*
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. It it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
@@ -377,7 +429,7 @@ struct btrfs_caching_control *btrfs_get_caching_control(
return ctl;
}
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
@@ -400,29 +452,43 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
+ int progress;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return;
+ /*
+ * We've already failed to allocate from this block group, so even if
+ * there's enough space in the block group it isn't contiguous enough to
+ * allow for an allocation, so wait for at least the next wakeup tick,
+ * or for the thing to be done.
+ */
+ progress = atomic_read(&caching_ctl->progress);
+
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
- (cache->free_space_ctl->free_space >= num_bytes));
+ (progress != atomic_read(&caching_ctl->progress) &&
+ (cache->free_space_ctl->free_space >= num_bytes)));
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -449,33 +515,44 @@ static void fragment_free_space(struct btrfs_block_group *block_group)
#endif
/*
- * This is only called by btrfs_cache_block_group, since we could have freed
- * extents we need to check the pinned_extents for any extents that can't be
- * used yet since their free space will be released as soon as the transaction
- * commits.
+ * Add a free space range to the in memory free space cache of a block group.
+ * This checks if the range contains super block locations and any such
+ * locations are not added to the free space cache.
+ *
+ * @block_group: The target block group.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (exclusive).
+ * @total_added_ret: Optional pointer to return the total amount of space
+ * added to the block group's free space cache.
+ *
+ * Returns 0 on success or < 0 on error.
*/
-u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
+int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
+ u64 end, u64 *total_added_ret)
{
struct btrfs_fs_info *info = block_group->fs_info;
- u64 extent_start, extent_end, size, total_added = 0;
+ u64 extent_start, extent_end, size;
int ret;
+ if (total_added_ret)
+ *total_added_ret = 0;
+
while (start < end) {
- ret = find_first_extent_bit(&info->excluded_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL);
- if (ret)
+ if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
break;
if (extent_start <= start) {
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
- total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group,
start, size);
- BUG_ON(ret); /* -ENOMEM or logic error */
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
start = extent_end + 1;
} else {
break;
@@ -484,21 +561,159 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
- total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group, start,
size);
- BUG_ON(ret); /* -ENOMEM or logic error */
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
}
- return total_added;
+ return 0;
+}
+
+/*
+ * Get an arbitrary extent item index / max_index through the block group
+ *
+ * @block_group the block group to sample from
+ * @index: the integral step through the block group to grab from
+ * @max_index: the granularity of the sampling
+ * @key: return value parameter for the item we find
+ *
+ * Pre-conditions on indices:
+ * 0 <= index <= max_index
+ * 0 < max_index
+ *
+ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
+ * error code on error.
+ */
+static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_block_group *block_group,
+ int index, int max_index,
+ struct btrfs_key *found_key)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *extent_root;
+ u64 search_offset;
+ u64 search_end = block_group->start + block_group->length;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_key search_key;
+ int ret = 0;
+
+ ASSERT(index >= 0);
+ ASSERT(index <= max_index);
+ ASSERT(max_index > 0);
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
+ BTRFS_SUPER_INFO_OFFSET));
+
+ path->skip_locking = true;
+ path->search_commit_root = true;
+ path->reada = READA_FORWARD;
+
+ search_offset = index * div_u64(block_group->length, max_index);
+ search_key.objectid = block_group->start + search_offset;
+ search_key.type = BTRFS_EXTENT_ITEM_KEY;
+ search_key.offset = 0;
+
+ btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
+ /* Success; sampled an extent item in the block group */
+ if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
+ found_key->objectid >= block_group->start &&
+ found_key->objectid + found_key->offset <= search_end)
+ break;
+
+ /* We can't possibly find a valid extent item anymore */
+ if (found_key->objectid >= search_end) {
+ ret = 1;
+ break;
+ }
+ }
+
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
+/*
+ * Best effort attempt to compute a block group's size class while caching it.
+ *
+ * @block_group: the block group we are caching
+ *
+ * We cannot infer the size class while adding free space extents, because that
+ * logic doesn't care about contiguous file extents (it doesn't differentiate
+ * between a 100M extent and 100 contiguous 1M extents). So we need to read the
+ * file extent items. Reading all of them is quite wasteful, because usually
+ * only a handful are enough to give a good answer. Therefore, we just grab 5 of
+ * them at even steps through the block group and pick the smallest size class
+ * we see. Since size class is best effort, and not guaranteed in general,
+ * inaccuracy is acceptable.
+ *
+ * To be more explicit about why this algorithm makes sense:
+ *
+ * If we are caching in a block group from disk, then there are three major cases
+ * to consider:
+ * 1. the block group is well behaved and all extents in it are the same size
+ * class.
+ * 2. the block group is mostly one size class with rare exceptions for last
+ * ditch allocations
+ * 3. the block group was populated before size classes and can have a totally
+ * arbitrary mix of size classes.
+ *
+ * In case 1, looking at any extent in the block group will yield the correct
+ * result. For the mixed cases, taking the minimum size class seems like a good
+ * approximation, since gaps from frees will be usable to the size class. For
+ * 2., a small handful of file extents is likely to yield the right answer. For
+ * 3, we can either read every file extent, or admit that this is best effort
+ * anyway and try to stay fast.
+ *
+ * Returns: 0 on success, negative error code on error.
+ */
+static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_key key;
+ int i;
+ u64 min_size = block_group->length;
+ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
+ int ret;
+
+ if (!btrfs_block_group_should_use_size_class(block_group))
+ return 0;
+
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
+ for (i = 0; i < 5; ++i) {
+ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ continue;
+ min_size = min_t(u64, min_size, key.offset);
+ size_class = btrfs_calc_block_group_size_class(min_size);
+ }
+ if (size_class != BTRFS_BG_SZ_NONE) {
+ spin_lock(&block_group->lock);
+ block_group->size_class = size_class;
+ spin_unlock(&block_group->lock);
+ }
+out:
+ return ret;
}
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_path *path;
+ struct btrfs_root *extent_root;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
@@ -512,6 +727,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -528,13 +744,13 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
* root to add free space. So we skip locking and search the commit
* root, since its read-only
*/
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -559,8 +775,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -582,11 +796,8 @@ next:
if (key.objectid < last) {
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
+ key.offset = 0;
btrfs_release_path(path);
goto next;
}
@@ -601,8 +812,13 @@ next:
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
- total_found += add_new_free_space(block_group, last,
- key.objectid);
+ u64 space_added;
+
+ ret = btrfs_add_new_free_space(block_group, last,
+ key.objectid, &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
fs_info->nodesize;
@@ -611,23 +827,28 @@ next:
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- if (wakeup)
+ if (wakeup) {
+ atomic_inc(&caching_ctl->progress);
wake_up(&caching_ctl->wait);
+ }
}
}
path->slots[0]++;
}
- ret = 0;
-
- total_found += add_new_free_space(block_group, last,
- block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
+ ret = btrfs_add_new_free_space(block_group, last,
+ block_group->start + block_group->length,
+ NULL);
out:
- btrfs_free_path(path);
return ret;
}
+static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
+{
+ btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
+}
+
static noinline void caching_thread(struct btrfs_work *work)
{
struct btrfs_block_group *block_group;
@@ -642,11 +863,37 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
- ret = load_free_space_tree(caching_ctl);
+ load_block_group_size_class(caching_ctl, block_group);
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ ret = load_free_space_cache(block_group);
+ if (ret == 1) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * We failed to load the space cache, set ourselves to
+ * CACHE_STARTED and carry on.
+ */
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&block_group->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
+ ret = btrfs_load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
-
+done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
@@ -666,8 +913,6 @@ static noinline void caching_thread(struct btrfs_work *work)
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -678,13 +923,16 @@ static noinline void caching_thread(struct btrfs_work *work)
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_caching_control *caching_ctl;
+ struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -693,120 +941,38 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
- refcount_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ refcount_set(&caching_ctl->count, 2);
+ atomic_set(&caching_ctl->progress, 0);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
spin_lock(&cache->lock);
- /*
- * This should be a rare occasion, but this could happen I think in the
- * case where one thread starts to load the space cache info, and then
- * some other thread starts a transaction commit which tries to do an
- * allocation while the other thread is still loading the space cache
- * info. The previous loop should have kept us from choosing this block
- * group, but if we've moved to the state where we will wait on caching
- * block groups we need to first check if we're doing a fast load here,
- * so we can wait for it to finish, otherwise we could end up allocating
- * from a block group who's cache gets evicted for one reason or
- * another.
- */
- while (cache->cached == BTRFS_CACHE_FAST) {
- struct btrfs_caching_control *ctl;
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&cache->lock);
-
- schedule();
-
- finish_wait(&ctl->wait, &wait);
- btrfs_put_caching_control(ctl);
- spin_lock(&cache->lock);
- }
-
if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
kfree(caching_ctl);
- return 0;
+
+ caching_ctl = cache->caching_ctl;
+ if (caching_ctl)
+ refcount_inc(&caching_ctl->count);
+ spin_unlock(&cache->lock);
+ goto out;
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_FAST;
+ cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
- mutex_lock(&caching_ctl->mutex);
- ret = load_free_space_cache(cache);
-
- spin_lock(&cache->lock);
- if (ret == 1) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->last_byte_to_unpin = (u64)-1;
- caching_ctl->progress = (u64)-1;
- } else {
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- }
- spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
- if (ret == 1 &&
- btrfs_should_fragment_free_space(cache)) {
- u64 bytes_used;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- bytes_used = cache->length - cache->used;
- cache->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fragment_free_space(cache);
- }
-#endif
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
- if (ret == 1) {
- btrfs_put_caching_control(caching_ctl);
- btrfs_free_excluded_extents(cache);
- return 0;
- }
- } else {
- /*
- * We're either using the free space tree or no caching at all.
- * Set cached to the appropriate value and wakeup any waiters.
- */
- spin_lock(&cache->lock);
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- spin_unlock(&cache->lock);
- wake_up(&caching_ctl->wait);
- }
-
- if (load_cache_only) {
- btrfs_put_caching_control(caching_ctl);
- return 0;
- }
-
- down_write(&fs_info->commit_root_sem);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->commit_root_sem);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+out:
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
+ if (caching_ctl)
+ btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -864,6 +1030,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
+static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *block_group)
@@ -873,7 +1046,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- root = fs_info->extent_root;
+ root = btrfs_block_group_root(fs_info);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
@@ -889,25 +1062,25 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
struct inode *inode;
struct kobject *kobj = NULL;
int ret;
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!block_group);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
+ if (!block_group)
+ return -ENOENT;
+
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group);
@@ -937,10 +1110,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -974,51 +1150,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out_put_group;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.type = 0;
- key.offset = block_group->start;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
- goto out_put_group;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out_put_group;
- btrfs_release_path(path);
- }
+ ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+ if (ret)
+ goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -1038,32 +1182,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1079,19 +1222,52 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length);
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
}
block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
+ -block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
- mutex_lock(&fs_info->chunk_mutex);
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
@@ -1102,7 +1278,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1118,55 +1294,30 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
-
- ret = remove_block_group_item(trans, path, block_group);
- if (ret < 0)
- goto out;
-
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
- btrfs_free_path(path);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
return ret;
}
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1187,12 +1338,10 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items);
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
/*
@@ -1208,7 +1357,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1217,6 +1366,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (cache->ro) {
cache->ro++;
ret = 0;
@@ -1224,7 +1378,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
}
num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->used;
+ cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1248,13 +1402,18 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
* leeway to allow us to mark this block group as read only.
*/
- if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
- BTRFS_RESERVE_NO_FLUSH))
+ if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
if (!ret) {
sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
+ cache->zone_unusable = 0;
+ }
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
@@ -1264,24 +1423,23 @@ out:
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
- btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ btrfs_dump_space_info(cache->space_info, 0, false);
}
return ret;
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *bg)
+ const struct btrfs_block_group *bg)
{
- struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
int ret;
spin_lock(&fs_info->trans_lock);
- if (trans->transaction->list.prev != &fs_info->trans_list) {
- prev_trans = list_last_entry(&trans->transaction->list,
- struct btrfs_transaction, list);
+ if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
+ prev_trans = list_prev_entry(trans->transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
@@ -1294,18 +1452,18 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
- * it, leading to a BUG_ON() at unpin_extent_range().
+ * it, leading to an error at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
- ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
if (ret)
goto out;
}
- ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
@@ -1315,11 +1473,38 @@ out:
}
/*
+ * Link the block_group to a list via bg_list.
+ *
+ * @bg: The block_group to link to the list.
+ * @list: The list to link it to.
+ *
+ * Use this rather than list_add_tail() directly to ensure proper respect
+ * to locking and refcounting.
+ *
+ * Returns: true if the bg was linked with a refcount bump and false otherwise.
+ */
+static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ bool added = false;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, list);
+ added = true;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return added;
+}
+
+/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1329,8 +1514,19 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ return;
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1348,8 +1544,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -1368,22 +1562,69 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
+ *
+ * Also bail out if this is the only block group for its
+ * type, because otherwise we would lose profile
+ * information from fs_info->avail_*_alloc_bits and the
+ * next block group of this type would be created with a
+ * "single" profile (even if we're in a raid fs) because
+ * fs_info->avail_*_alloc_bits would be 0.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * accounting with the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_link_bg_list(block_group, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1393,6 +1634,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN) {
+ btrfs_link_bg_list(block_group, &retry_list);
+ ret = 0;
+ }
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1435,12 +1686,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- btrfs_space_info_update_bytes_pinned(fs_info, space_info,
- -block_group->pinned);
+ btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1456,8 +1703,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1495,16 +1746,20 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1518,87 +1773,338 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
btrfs_get_block_group(bg);
trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
+ /* Pull out the block group from the reclaim_bgs list. */
+ trace_btrfs_add_unused_block_group(bg);
+ list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- struct btrfs_key *key)
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
{
- struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
+}
+
+static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
+static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
+{
+ const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+ u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+
+ if (thresh_bytes == 0)
+ return false;
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh_bytes)
+ return false;
+ if (new_val >= thresh_bytes)
+ return false;
+ return true;
+}
+
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+ LIST_HEAD(retry_list);
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
+ guard(super_write)(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ return;
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip reclaim if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+ btrfs_exclop_finish(fs_info);
+ return;
+ }
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ u64 used;
+ u64 reserved;
+ int ret = 0;
+
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ if (bg->used == 0) {
+ /*
+ * It is possible that we trigger relocation on a block
+ * group as its extents are deleted and it first goes
+ * below the threshold, then shortly after goes empty.
+ *
+ * In this case, relocating it does delete it, but has
+ * some overhead in relocation specific metadata, looking
+ * for the non-existent extents and running some extra
+ * transactions, which we can avoid by using one of the
+ * other mechanisms for dealing with empty block groups.
+ */
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(bg);
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+
+ }
+ /*
+ * The block group might no longer meet the reclaim condition by
+ * the time we get around to reclaiming it, so to avoid
+ * reclaiming overly full block_groups, skip reclaiming them.
+ *
+ * Since the decision making process also depends on the amount
+ * being freed, pass in a fake giant value to skip that extra
+ * check, which is more meaningful when adding to the list in
+ * the first place.
+ */
+ if (!should_reclaim_block_group(bg, bg->length)) {
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Get out fast, in case we're read-only or unmounting the
+ * filesystem. It is OK to drop block groups from the list even
+ * for the read-only case. As we did take the super write lock,
+ * "mount -o remount,ro" won't happen and read-only filesystem
+ * means it is forced read-only due to a fatal error. So, it
+ * never gets back to read-write to let us reclaim again.
+ */
+ if (btrfs_need_cleaner_sleep(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * happens when running delayed references. When we relocate the
+ * chunk below, relocation first flushes delalloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+ used = 0;
+ reserved = 0;
+ spin_lock(&space_info->lock);
+ space_info->reclaim_errors++;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ space_info->periodic_reclaim_ready = false;
+ spin_unlock(&space_info->lock);
+ }
+ spin_lock(&space_info->lock);
+ space_info->reclaim_count++;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
+ spin_unlock(&space_info->lock);
+
+next:
+ if (ret && !READ_ONCE(space_info->periodic_reclaim))
+ btrfs_link_bg_list(bg, &retry_list);
+ btrfs_put_block_group(bg);
+
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ /*
+ * Reclaiming all the block groups in the list can take really
+ * long. Prioritize cleaning up unused block groups.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+ /*
+ * If we are interrupted by a balance, we can just bail out. The
+ * cleaner thread restart again if necessary.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ goto end;
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+end:
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ btrfs_reclaim_sweep(fs_info);
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
+ trace_btrfs_add_reclaim_block_group(bg);
+}
+
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
+ const struct btrfs_path *path)
+{
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
- u64 flags;
+ struct extent_buffer *leaf;
int slot;
+ u64 flags;
+ int ret = 0;
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- goto out;
+ slot = path->slots[0];
+ leaf = path->nodes[0];
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, map->start, map->chunk_len);
+ ret = -EUCLEAN;
+ goto out_free_map;
+ }
+
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_map:
+ btrfs_free_chunk_map(map);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ int ret;
+ struct btrfs_key found_key;
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_stack_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
- path->slots[0]++;
}
-out:
return ret;
}
@@ -1617,8 +2123,10 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
-/**
- * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+/*
+ * Map a physical disk address to a list of logical addresses.
+ *
+ * @fs_info: the filesystem
* @chunk_start: logical address of block group
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
@@ -1629,12 +2137,10 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
-EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -1642,24 +2148,17 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->len;
- io_stripe_size = map->stripe_len;
+ data_stripe_length = map->stripe_size;
+ io_stripe_size = BTRFS_STRIPE_LEN;
+ chunk_start = map->start;
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- data_stripe_length = div_u64(data_stripe_length,
- map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- data_stripe_length = div_u64(data_stripe_length,
- nr_data_stripes(map));
- io_stripe_size = map->stripe_len * nr_data_stripes(map);
- }
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
@@ -1669,29 +2168,29 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
- u64 stripe_nr;
+ u32 stripe_nr;
+ u32 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
- stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+ stripe_nr = (physical - map->stripes[i].physical) >>
+ BTRFS_STRIPE_LEN_SHIFT;
+ offset = (physical - map->stripes[i].physical) &
+ BTRFS_STRIPE_LEN_MASK;
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- }
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
+ map->sub_stripes);
/*
* The remaining case would be for RAID56, multiply by
* nr_data_stripes(). Alternatively, just use rmap_len below
* instead of map->stripe_len
*/
-
- bytenr = chunk_start + stripe_nr * io_stripe_size;
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
@@ -1709,13 +2208,14 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ const bool zoned = btrfs_is_zoned(fs_info);
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -1724,8 +2224,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = btrfs_add_excluded_extent(fs_info, cache->start,
- stripe_len);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
+ cache->start + stripe_len - 1,
+ EXTENT_DIRTY, NULL);
if (ret)
return ret;
}
@@ -1737,26 +2238,23 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (ret)
return ret;
- while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->start + cache->length)
- continue;
-
- if (logical[nr] + stripe_len <= cache->start)
- continue;
+ /* Shouldn't have super stripes in sequential zones */
+ if (unlikely(zoned && nr)) {
+ kfree(logical);
+ btrfs_err(fs_info,
+ "zoned: block group %llu must not contain super block",
+ cache->start);
+ return -EUCLEAN;
+ }
- start = logical[nr];
- if (start < cache->start) {
- start = cache->start;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->start + cache->length - start);
- }
+ while (nr--) {
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, start, len);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
+ logical[nr], logical[nr] + len - 1,
+ EXTENT_DIRTY, NULL);
if (ret) {
kfree(logical);
return ret;
@@ -1768,22 +2266,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
-
- down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1804,11 +2286,10 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
- atomic_set(&cache->count, 1);
+ refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
@@ -1818,10 +2299,10 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
- btrfs_init_free_space_ctl(cache);
+ INIT_LIST_HEAD(&cache->active_bg_list);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
@@ -1832,79 +2313,58 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
- if (!bg) {
+ bg = btrfs_lookup_block_group(fs_info, map->start);
+ if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
- struct btrfs_path *path,
- const struct btrfs_key *key)
-{
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_block_group_item bgi;
- int slot = path->slots[0];
-
- cache->length = key->offset;
-
- read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
- cache->used = btrfs_stack_block_group_used(&bgi);
- cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
-}
-
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_path *path,
+ struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
@@ -1914,9 +2374,14 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->commit_used = cache->used;
+ cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->space_info = btrfs_find_space_info(info, cache->flags);
+
+ btrfs_set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -1941,6 +2406,13 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
@@ -1954,54 +2426,113 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
/*
- * Check for two cases, either we are full, and therefore don't need
- * to bother with the caching work since we won't find any space, or we
- * are empty, and we can just add all the space in and be done with it.
- * This saves us _a_lot_ of time, particularly in the full case.
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
*/
- if (cache->length == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->length == cache->used) {
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, cache->start,
- cache->start + cache->length);
+ ret = btrfs_add_new_free_space(cache, cache->start,
+ cache->start + cache->length, NULL);
btrfs_free_excluded_extents(cache);
+ if (ret)
+ goto error;
}
- ret = btrfs_add_block_group_cache(info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
}
- trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super, &space_info);
- cache->space_info = space_info;
-
- link_block_group(cache);
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
return ret;
}
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ int ret = 0;
+
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
+ if (!bg) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* Fill dummy cache as FULL */
+ bg->length = map->chunk_len;
+ bg->flags = map->type;
+ bg->cached = BTRFS_CACHE_FINISHED;
+ bg->used = map->chunk_len;
+ bg->flags = map->type;
+ bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
+ ret = btrfs_add_block_group_cache(bg);
+ /*
+ * We may have some valid block group cache added already, in
+ * that case we skip to the next one.
+ */
+ if (ret == -EEXIST) {
+ ret = 0;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+
+ if (ret) {
+ btrfs_remove_free_space_cache(bg);
+ btrfs_put_block_group(bg);
+ break;
+ }
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+
+ set_avail_alloc_bits(fs_info, bg->flags);
+ }
+ if (!ret)
+ btrfs_init_global_block_rsv(fs_info);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
+ struct btrfs_root *root = btrfs_block_group_root(info);
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
@@ -2010,9 +2541,21 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
+ return fill_dummy_bgs(info);
+
key.objectid = 0;
- key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2025,23 +2568,44 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- ret = read_one_block_group(info, path, &key, need_clear);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
- btrfs_release_path(path);
}
+ btrfs_release_path(path);
+
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2061,71 +2625,260 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
+ /*
+ * We've hit some error while reading the extent tree, and have
+ * rescue=ibadroots mount option.
+ * Try to fill the tree using dummy block groups so that the user can
+ * continue to mount and grab their data.
+ */
+ if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+ ret = fill_dummy_bgs(info);
return ret;
}
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_item bgi;
- struct btrfs_root *root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
+ u64 old_commit_used;
+ int ret;
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ old_commit_used = block_group->commit_used;
+ block_group->commit_used = block_group->used;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- root = fs_info->extent_root;
- return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ if (ret < 0) {
+ spin_lock(&block_group->lock);
+ block_group->commit_used = old_commit_used;
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+ const struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ return ret;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+
+ return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_device *device;
+ struct btrfs_chunk_map *map;
+ u64 dev_offset;
+ int i;
+ int ret = 0;
+
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+ map->stripe_size);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ btrfs_free_chunk_map(map);
+ return ret;
}
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group;
int ret = 0;
- if (!trans->can_flush_pending_bgs)
- return;
-
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, block_group->start,
- block_group->length);
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+ ret = insert_dev_extents(trans, block_group->start,
+ block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
+ btrfs_add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the exact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be incompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size)
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2135,47 +2888,58 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Mark it as new before adding it to the rbtree of block groups or any
+ * list, so that no other task finds it and calls btrfs_mark_bg_unused()
+ * before the new flag is set.
+ */
+ set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
cache->length = size;
- cache->used = bytes_used;
+ btrfs_set_free_space_tree_thresholds(cache);
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- cache->needs_free_space = 1;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
btrfs_free_excluded_extents(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
-
+ ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
btrfs_free_excluded_extents(cache);
-
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
}
-#endif
+
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
* with its ->space_info set.
*/
- cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ cache->space_info = space_info;
ASSERT(cache->space_info);
- ret = btrfs_add_block_group_cache(fs_info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
/*
@@ -2183,66 +2947,21 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ cache->space_info->bytes_used += size >> 1;
+ fragment_free_space(cache);
+ }
+#endif
- list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
+ btrfs_link_bg_list(cache, &trans->new_bgs);
+ btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
- return 0;
-}
-
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
- BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
+ return cache;
}
/*
@@ -2258,41 +2977,60 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
int ret;
-
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ bool dirty_bg_running;
/*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
*/
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
-
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
-
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
+ return ret;
}
+ do {
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ dirty_bg_running = false;
+
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
+
if (do_chunk_alloc) {
/*
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags,
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
@@ -2306,18 +3044,38 @@ again:
}
ret = inc_block_group_ro(cache, 0);
- if (!do_chunk_alloc)
- goto unlock_out;
if (!ret)
goto out;
- alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
+
+ /*
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
+ * chunk allocation storm to exhaust the system chunk array. Otherwise
+ * we still want to try our best to mark the block group read-only.
+ */
+ if (!do_chunk_alloc && ret == -ENOSPC &&
+ (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
+ goto unlock_out;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(space_info, true);
if (ret < 0)
goto out;
+
ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2339,8 +3097,18 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used - cache->pinned -
+ cache->reserved) +
+ (cache->length - cache->zone_capacity);
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super - cache->used;
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
@@ -2354,11 +3122,30 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
struct btrfs_key key;
+ u64 old_commit_used;
+ u64 used;
+
+ /*
+ * Block group items update can be triggered out of commit transaction
+ * critical section, thus we need a consistent view of used bytes.
+ * We cannot use cache->used directly outside of the spin lock, as it
+ * may be changed.
+ */
+ spin_lock(&cache->lock);
+ old_commit_used = cache->commit_used;
+ used = cache->used;
+ /* No change in used bytes, can safely skip it. */
+ if (cache->commit_used == used) {
+ spin_unlock(&cache->lock);
+ return 0;
+ }
+ cache->commit_used = used;
+ spin_unlock(&cache->lock);
key.objectid = cache->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2373,14 +3160,27 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_used(&bgi, used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
+ /*
+ * We didn't update the block group item, need to revert commit_used
+ * unless the block group item didn't exist yet - this is to prevent a
+ * race with a concurrent insertion of the block group item, with
+ * insert_block_group_item(), that happened just after we attempted to
+ * update. In that case we would reset commit_used to 0 just after the
+ * insertion set it to a value greater than 0 - if the block group later
+ * becomes with 0 used bytes, we would incorrectly skip its update.
+ */
+ if (ret < 0 && ret != -ENOENT) {
+ spin_lock(&cache->lock);
+ cache->commit_used = old_commit_used;
+ spin_unlock(&cache->lock);
+ }
return ret;
}
@@ -2390,15 +3190,17 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- u64 num_pages = 0;
+ u64 cache_size = 0;
int retries = 0;
int ret = 0;
+ if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
/*
* If this block group is smaller than 100 megs don't bother caching the
* block group.
@@ -2439,8 +3241,8 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
@@ -2504,19 +3306,20 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->length, SZ_256M);
- if (!num_pages)
- num_pages = 1;
+ cache_size = div_u64(block_group->length, SZ_256M);
+ if (!cache_size)
+ cache_size = 1;
- num_pages *= 16;
- num_pages *= PAGE_SIZE;
+ cache_size *= 16;
+ cache_size *= fs_info->sectorsize;
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ cache_size, false);
if (ret)
goto out_put;
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
- num_pages, num_pages,
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+ cache_size, cache_size,
&alloc_hint);
/*
* Our cache requires contiguous chunks so that we don't modify a bunch
@@ -2551,7 +3354,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
@@ -2568,7 +3371,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
}
- btrfs_free_path(path);
return 0;
}
@@ -2591,10 +3393,9 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2611,8 +3412,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2658,7 +3461,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2705,17 +3507,15 @@ again:
if (should_put)
btrfs_put_block_group(cache);
if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2724,7 +3524,8 @@ again:
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2738,11 +3539,15 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
- btrfs_free_path(path);
return ret;
}
@@ -2753,9 +3558,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2806,14 +3610,12 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -2836,22 +3638,24 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
- }
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
}
/* If its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -2868,20 +3672,19 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
btrfs_put_block_group(cache);
}
- btrfs_free_path(path);
return ret;
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group *cache = NULL;
- u64 total = num_bytes;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *cache;
u64 old_val;
- u64 byte_in_group;
+ bool reclaim = false;
+ bool bg_already_dirty = true;
int factor;
- int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -2893,96 +3696,96 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
- while (total) {
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- factor = btrfs_bg_type_to_factor(cache->flags);
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ /* An extent can not span multiple block groups. */
+ ASSERT(bytenr + num_bytes <= cache->start + cache->length);
- byte_in_group = bytenr - cache->start;
- WARN_ON(byte_in_group > cache->length);
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
+ /*
+ * If this block group has free space cache written out, we need to make
+ * sure to load it if we are removing space. This is because we need
+ * the unpinning stage to actually add the space back to the block group,
+ * otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
- old_val = cache->used;
- num_bytes = min(total, cache->length - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- cache->used = old_val;
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
- cache->space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- } else {
- old_val -= num_bytes;
- cache->used = old_val;
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info,
- cache->space_info, num_bytes);
- cache->space_info->bytes_used -= num_bytes;
- cache->space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- GFP_NOFS | __GFP_NOFAIL);
- }
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
+ old_val = cache->used;
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ cache->reclaim_mark = 0;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, -num_bytes);
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, num_bytes);
+ else
+ reclaim = should_reclaim_block_group(cache, num_bytes);
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0) {
- if (!btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(cache);
- }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /*
+ * No longer have used bytes in this block group, queue it for deletion.
+ * We do this after adding the block group to the dirty list to avoid
+ * races between cleaner kthread and space cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
+ btrfs_put_block_group(cache);
+
/* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
- return ret;
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+ return 0;
}
-/**
- * btrfs_add_reserved_bytes - update the block_group and space info counters
+/*
+ * Update the block_group and space info counters.
+ *
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
@@ -2994,57 +3797,89 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc)
+ u64 ram_bytes, u64 num_bytes, bool delalloc,
+ bool force_wrong_size_class)
{
struct btrfs_space_info *space_info = cache->space_info;
+ enum btrfs_block_group_size_class size_class;
int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (cache->ro) {
ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- trace_btrfs_space_reservation(cache->fs_info, "space_info",
- space_info->flags, num_bytes, 1);
- btrfs_space_info_update_bytes_may_use(cache->fs_info,
- space_info, -ram_bytes);
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
+ goto out_error;
}
+
+ if (btrfs_block_group_should_use_size_class(cache)) {
+ size_class = btrfs_calc_block_group_size_class(num_bytes);
+ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
+ if (ret)
+ goto out_error;
+ }
+
+ cache->reserved += num_bytes;
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+
+ trace_btrfs_space_reservation(cache->fs_info, "space_info",
+ space_info->flags, num_bytes, 1);
+ spin_unlock(&cache->lock);
+
+ space_info->bytes_reserved += num_bytes;
+ btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
+
+ /*
+ * Compression can use less space than we reserved, so wake tickets if
+ * that happens.
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(space_info);
+ spin_unlock(&space_info->lock);
+
+ return 0;
+
+out_error:
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
}
-/**
- * btrfs_free_reserved_bytes - update the block_group and space info counters
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
+/*
+ * Update the block_group and space info counters.
+ *
+ * @cache: The cache we are manipulating.
+ * @num_bytes: The number of bytes in question.
+ * @is_delalloc: Whether the blocks are allocated for a delalloc write.
*
* This is called by somebody who is freeing space that was never actually used
* on disk. For example if you reserve some space for a new leaf in transaction
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc)
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
+ bool bg_ro;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
+ bg_ro = cache->ro;
cache->reserved -= num_bytes;
+ if (is_delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+
+ if (bg_ro)
+ space_info->bytes_readonly += num_bytes;
+ else if (btrfs_is_zoned(cache->fs_info))
+ space_info->bytes_zone_unusable += num_bytes;
+
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
- if (delalloc)
- cache->delalloc_bytes -= num_bytes;
- spin_unlock(&cache->lock);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
@@ -3053,22 +3888,20 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *sinfo, int force)
+static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
- return 1;
+ return true;
/*
* in limited mode, we want to have some free space up to
@@ -3076,48 +3909,286 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(fs_info->super_copy);
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
- return 1;
+ return true;
}
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
- return 0;
- return 1;
+ if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
+ return false;
+ return true;
}
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(trans->fs_info, type);
+ if (!space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+
+ return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+}
+
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info,
+ u64 flags)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ /*
+ * Check if we have enough space in the system space info because we
+ * will need to update device items in the chunk btree and insert a new
+ * chunk item in the chunk btree as well. This will allocate a new
+ * system block group if needed.
+ */
+ check_system_chunk(trans, flags);
+
+ bg = btrfs_create_chunk(trans, space_info, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ /*
+ * Normally we are not expected to fail with -ENOSPC here, since we have
+ * previously reserved space in the system space_info and allocated one
+ * new system chunk if necessary. However there are three exceptions:
+ *
+ * 1) We may have enough free space in the system space_info but all the
+ * existing system block groups have a profile which can not be used
+ * for extent allocation.
+ *
+ * This happens when mounting in degraded mode. For example we have a
+ * RAID1 filesystem with 2 devices, lose one device and mount the fs
+ * using the other device in degraded mode. If we then allocate a chunk,
+ * we may have enough free space in the existing system space_info, but
+ * none of the block groups can be used for extent allocation since they
+ * have a RAID1 profile, and because we are in degraded mode with a
+ * single device, we are forced to allocate a new system chunk with a
+ * SINGLE profile. Making check_system_chunk() iterate over all system
+ * block groups and check if they have a usable profile and enough space
+ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
+ * try again after forcing allocation of a new system chunk. Like this
+ * we avoid paying the cost of that search in normal circumstances, when
+ * we were not mounted in degraded mode;
+ *
+ * 2) We had enough free space info the system space_info, and one suitable
+ * block group to allocate from when we called check_system_chunk()
+ * above. However right after we called it, the only system block group
+ * with enough free space got turned into RO mode by a running scrub,
+ * and in this case we have to allocate a new one and retry. We only
+ * need do this allocate and retry once, since we have a transaction
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+ struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
+
+ sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
+ if (unlikely(!sys_space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
- return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ btrfs_trans_release_chunk_metadata(trans);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
}
/*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ * the chunk, the chunk mapping, create its block group and add the items
+ * that belong in the chunk btree to it - more specifically, we need to
+ * update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ * group item to the extent btree and the device extent items to the devices
+ * btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempted to insert the new block group item
+ * in the extent btree right way, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ * metadata space_info and therefore don't trigger allocation of a new chunk.
+ * However later when the task actually tries to COW an extent buffer from
+ * the extent btree or from the device btree for example, it is forced to
+ * allocate a new block group (chunk) because the only one that had enough
+ * free space was just turned to RO mode by a running scrub for example (or
+ * device replace, block group reclaim thread, etc), so we can not use it
+ * for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ * we end up not allocating a new metadata chunk in that case. However if
+ * the filesystem was mounted in degraded mode, none of the existing block
+ * groups might be suitable for extent allocation due to their incompatible
+ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ * use a RAID1 profile, in degraded mode using a single device). In this case
+ * when the task attempts to COW some extent buffer of the extent btree for
+ * example, it will trigger allocation of a new metadata block group with a
+ * suitable profile (SINGLE profile in the example of the degraded mount of
+ * the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ * it attempts to COW an extent buffer from the extent or device btree for
+ * example, it does not find any free extent in any metadata block group,
+ * therefore forced to try to allocate a new metadata block group.
+ * This is because some other task allocated all available extents in the
+ * meanwhile - this typically happens with tasks that don't reserve space
+ * properly, either intentionally or as a bug. One example where this is
+ * done intentionally is fsync, as it does not reserve any transaction units
+ * and ends up allocating a variable number of metadata extents for log
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
+ *
+ * We also need this 2 phases setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the later is used before doing
+ * a modification to the chunk btree - use cases for the later are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * @space_info: specify which space_info the new chunk should belong to.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
* - return 0 if it doesn't need to allocate a new chunk,
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
*/
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
+ /*
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -ENOSPC;
do {
spin_lock(&space_info->lock);
@@ -3126,11 +4197,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
should_alloc = should_alloc_chunk(fs_info, space_info, force);
if (space_info->full) {
/* No more free physical space */
+ spin_unlock(&space_info->lock);
if (should_alloc)
ret = -ENOSPC;
else
ret = 0;
- spin_unlock(&space_info->lock);
return ret;
} else if (!should_alloc) {
spin_unlock(&space_info->lock);
@@ -3142,15 +4213,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* recheck if we should continue with our allocation
* attempt.
*/
- wait_for_alloc = true;
spin_unlock(&space_info->lock);
+ wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
/* Proceed with allocation */
- space_info->chunk_alloc = 1;
- wait_for_alloc = false;
+ space_info->chunk_alloc = true;
spin_unlock(&space_info->lock);
+ wait_for_alloc = false;
}
cond_resched();
@@ -3178,19 +4250,26 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- /*
- * Check if we have enough space in SYSTEM chunk because we may need
- * to update devices.
- */
- check_system_chunk(trans, flags);
-
- ret = btrfs_alloc_chunk(trans, flags);
+ ret_bg = do_chunk_alloc(trans, space_info, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
- space_info->full = 1;
+ space_info->full = true;
else
goto out;
} else {
@@ -3200,30 +4279,14 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
- space_info->chunk_alloc = 0;
+ space_info->chunk_alloc = false;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
- /*
- * When we allocate a new chunk we reserve space in the chunk block
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
- * add new nodes/leafs to it if we end up needing to do it when
- * inserting the chunk item and updating device items as part of the
- * second phase of chunk allocation, performed by
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
- * large number of new block groups to create in our transaction
- * handle's new_bgs list to avoid exhausting the chunk block reserve
- * in extreme cases - like having a single transaction create many new
- * block groups when starting to write out the free space caches of all
- * the block groups that were made dirty during the lifetime of the
- * transaction.
- */
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
- btrfs_create_pending_block_groups(trans);
return ret;
}
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
@@ -3234,17 +4297,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3257,20 +4317,19 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
- btrfs_dump_space_info(fs_info, info, 0, 0);
+ left, bytes, type);
+ btrfs_dump_space_info(info, 0, false);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
/*
* Ignore failure to create system chunk. We might end up not
@@ -3278,51 +4337,149 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- ret = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else {
+ /*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(info, true);
+ if (ret < 0)
+ return;
+
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+ * btrfs_create_pending_block_groups(). So ignore
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
+ */
+ btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ }
}
if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ ret = btrfs_block_rsv_add(fs_info,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct btrfs_inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(&inode->vfs_inode);
+ } else {
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
}
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
+ block_group = btrfs_next_block_group(block_group);
+ }
+}
+
+static void check_removing_space_info(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *info = space_info->fs_info;
+
+ if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
+ /* This is a top space_info, proceed with its children first. */
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i]) {
+ check_removing_space_info(space_info->sub_group[i]);
+ kfree(space_info->sub_group[i]);
+ space_info->sub_group[i] = NULL;
+ }
}
+ }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ /*
+ * Do not hide this behind enospc_debug, this is actually important and
+ * indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(space_info, 0, false);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due to an
+ * IO failure on a writeback attempt of one or more of its extent
+ * buffers, we could not do proper (and cheap) unaccounting of their
+ * reserved space, so don't warn on bytes_reserved > 0 in that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(space_info, 0, false);
}
+
+ WARN_ON(space_info->reclaim_size > 0);
}
/*
@@ -3337,14 +4494,25 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->commit_root_sem);
+ if (btrfs_is_zoned(info)) {
+ if (info->active_meta_bg) {
+ btrfs_put_block_group(info->active_meta_bg);
+ info->active_meta_bg = NULL;
+ }
+ if (info->active_system_bg) {
+ btrfs_put_block_group(info->active_system_bg);
+ info->active_system_bg = NULL;
+ }
+ }
+
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
- caching_ctl = list_entry(info->caching_block_groups.next,
- struct btrfs_caching_control, list);
+ caching_ctl = list_first_entry(&info->caching_block_groups,
+ struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- up_write(&info->commit_root_sem);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3354,16 +4522,34 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
+
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
spin_unlock(&info->unused_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -3382,37 +4568,21 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
+ ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
-
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
- space_info = list_entry(info->space_info.next,
- struct btrfs_space_info,
- list);
+ space_info = list_first_entry(&info->space_info,
+ struct btrfs_space_info, list);
- /*
- * Do not hide this behind enospc_debug, this is actually
- * important and indicates a real bug if this happens.
- */
- if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
- WARN_ON(space_info->reclaim_size > 0);
+ check_removing_space_info(space_info);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
@@ -3427,35 +4597,123 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
- block_group->removed);
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
spin_unlock(&block_group->lock);
if (cleanup) {
- mutex_lock(&fs_info->chunk_mutex);
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- mutex_unlock(&fs_info->chunk_mutex);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ btrfs_remove_free_space_cache(block_group);
}
}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}
+
+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
+{
+ if (size <= SZ_128K)
+ return BTRFS_BG_SZ_SMALL;
+ if (size <= SZ_8M)
+ return BTRFS_BG_SZ_MEDIUM;
+ return BTRFS_BG_SZ_LARGE;
+}
+
+/*
+ * Handle a block group allocating an extent in a size class
+ *
+ * @bg: The block group we allocated in.
+ * @size_class: The size class of the allocation.
+ * @force_wrong_size_class: Whether we are desperate enough to allow
+ * mismatched size classes.
+ *
+ * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
+ * case of a race that leads to the wrong size class without
+ * force_wrong_size_class set.
+ *
+ * find_free_extent will skip block groups with a mismatched size class until
+ * it really needs to avoid ENOSPC. In that case it will set
+ * force_wrong_size_class. However, if a block group is newly allocated and
+ * doesn't yet have a size class, then it is possible for two allocations of
+ * different sizes to race and both try to use it. The loser is caught here and
+ * has to retry.
+ */
+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
+ enum btrfs_block_group_size_class size_class,
+ bool force_wrong_size_class)
+{
+ ASSERT(size_class != BTRFS_BG_SZ_NONE);
+
+ /* The new allocation is in the right size class, do nothing */
+ if (bg->size_class == size_class)
+ return 0;
+ /*
+ * The new allocation is in a mismatched size class.
+ * This means one of two things:
+ *
+ * 1. Two tasks in find_free_extent for different size_classes raced
+ * and hit the same empty block_group. Make the loser try again.
+ * 2. A call to find_free_extent got desperate enough to set
+ * 'force_wrong_slab'. Don't change the size_class, but allow the
+ * allocation.
+ */
+ if (bg->size_class != BTRFS_BG_SZ_NONE) {
+ if (force_wrong_size_class)
+ return 0;
+ return -EAGAIN;
+ }
+ /*
+ * The happy new block group case: the new allocation is the first
+ * one in the block_group so we set size_class.
+ */
+ bg->size_class = size_class;
+
+ return 0;
+}
+
+bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
+{
+ if (btrfs_is_zoned(bg->fs_info))
+ return false;
+ if (!btrfs_is_block_group_data_only(bg))
+ return false;
+ return true;
+}