summaryrefslogtreecommitdiff
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c1195
1 files changed, 581 insertions, 614 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2d7e86b51d1..e08d0d45af4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,7 +26,8 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
-#include "hash.h"
+#include <linux/lockdep.h>
+#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -38,6 +39,7 @@
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "ref-verify.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -61,9 +63,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node, u64 parent,
@@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc);
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
@@ -545,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_root *extent_root;
int ret;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- extent_root = fs_info->extent_root;
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
@@ -652,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
- if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
mutex_lock(&caching_ctl->mutex);
ret = load_free_space_cache(fs_info, cache);
@@ -923,7 +911,7 @@ search_again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -934,7 +922,7 @@ search_again:
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
goto search_again;
}
spin_lock(&head->lock);
@@ -943,7 +931,7 @@ search_again:
else
BUG_ON(num_refs == 0);
- num_refs += head->node.ref_mod;
+ num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
@@ -1213,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -2155,7 +2143,14 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- if (!stripe->dev->can_discard)
+ struct request_queue *req_q;
+
+ if (!stripe->dev->bdev) {
+ ASSERT(btrfs_test_opt(fs_info, DEGRADED));
+ continue;
+ }
+ req_q = bdev_get_queue(stripe->dev->bdev);
+ if (!blk_queue_discard(req_q))
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
@@ -2189,16 +2184,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
+ owner, offset, BTRFS_ADD_DELAYED_REF);
+
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes, parent,
@@ -2344,7 +2343,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
@@ -2366,14 +2365,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- key.objectid = node->bytenr;
+ key.objectid = head->bytenr;
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = extent_op->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = node->num_bytes;
+ key.offset = head->num_bytes;
}
again:
@@ -2390,17 +2389,17 @@ again:
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
- if (key.objectid == node->bytenr &&
+ if (key.objectid == head->bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == node->num_bytes)
+ key.offset == head->num_bytes)
ret = 0;
}
if (ret > 0) {
btrfs_release_path(path);
metadata = 0;
- key.objectid = node->bytenr;
- key.offset = node->num_bytes;
+ key.objectid = head->bytenr;
+ key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
goto again;
}
@@ -2507,44 +2506,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return 0;
}
- if (btrfs_delayed_ref_is_head(node)) {
- struct btrfs_delayed_ref_head *head;
- /*
- * we've hit the end of the chain and we were supposed
- * to insert this extent into the tree. But, it got
- * deleted before we ever needed to insert it, so all
- * we have to do is clean up the accounting
- */
- BUG_ON(extent_op);
- head = btrfs_delayed_node_to_head(node);
- trace_run_delayed_ref_head(fs_info, node, head, node->action);
-
- if (head->total_ref_mod < 0) {
- struct btrfs_block_group_cache *cache;
-
- cache = btrfs_lookup_block_group(fs_info, node->bytenr);
- ASSERT(cache);
- percpu_counter_add(&cache->space_info->total_bytes_pinned,
- -node->num_bytes);
- btrfs_put_block_group(cache);
- }
-
- if (insert_reserved) {
- btrfs_pin_extent(fs_info, node->bytenr,
- node->num_bytes, 1);
- if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info,
- node->bytenr,
- node->num_bytes);
- }
- }
-
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
- return ret;
- }
-
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
@@ -2563,7 +2524,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (list_empty(&head->ref_list))
+ if (RB_EMPTY_ROOT(&head->ref_tree))
return NULL;
/*
@@ -2576,20 +2537,122 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
- list);
+ ref = rb_entry(rb_first(&head->ref_tree),
+ struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
}
+static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+static int cleanup_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ int ret;
+
+ if (!extent_op)
+ return 0;
+ head->extent_op = NULL;
+ if (head->must_insert_reserved) {
+ btrfs_free_delayed_extent_op(extent_op);
+ return 0;
+ }
+ spin_unlock(&head->lock);
+ ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
+ return ret ? ret : 1;
+}
+
+static int cleanup_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ ret = cleanup_extent_op(trans, fs_info, head);
+ if (ret < 0) {
+ unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+ return ret;
+ } else if (ret) {
+ return ret;
+ }
+
+ /*
+ * Need to drop our head ref lock and re-acquire the delayed ref lock
+ * and then re-check to make sure nobody got added.
+ */
+ spin_unlock(&head->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&head->lock);
+ if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ return 1;
+ }
+ delayed_refs->num_heads--;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
+ spin_unlock(&delayed_refs->lock);
+ spin_unlock(&head->lock);
+ atomic_dec(&delayed_refs->num_entries);
+
+ trace_run_delayed_ref_head(fs_info, head, 0);
+
+ if (head->total_ref_mod < 0) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ ASSERT(cache);
+ percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ -head->num_bytes);
+ btrfs_put_block_group(cache);
+
+ if (head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ }
+ }
+
+ if (head->must_insert_reserved) {
+ btrfs_pin_extent(fs_info, head->bytenr,
+ head->num_bytes, 1);
+ if (head->is_data) {
+ ret = btrfs_del_csums(trans, fs_info, head->bytenr,
+ head->num_bytes);
+ }
+ }
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+ return 0;
+}
+
/*
* Returns 0 on success or if called with an already aborted transaction.
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
unsigned long nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2655,11 +2718,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (ref && ref->seq &&
btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
spin_unlock(&locked_ref->lock);
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(locked_ref);
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
locked_ref = NULL;
cond_resched();
count++;
@@ -2667,102 +2726,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
/*
- * record the must insert reserved flag before we
- * drop the spin lock.
+ * We're done processing refs in this ref_head, clean everything
+ * up and move on to the next ref_head.
*/
- must_insert_reserved = locked_ref->must_insert_reserved;
- locked_ref->must_insert_reserved = 0;
-
- extent_op = locked_ref->extent_op;
- locked_ref->extent_op = NULL;
-
if (!ref) {
-
-
- /* All delayed refs have been processed, Go ahead
- * and send the head node to run_one_delayed_ref,
- * so that any accounting fixes can happen
- */
- ref = &locked_ref->node;
-
- if (extent_op && must_insert_reserved) {
- btrfs_free_delayed_extent_op(extent_op);
- extent_op = NULL;
- }
-
- if (extent_op) {
- spin_unlock(&locked_ref->lock);
- ret = run_delayed_extent_op(trans, fs_info,
- ref, extent_op);
- btrfs_free_delayed_extent_op(extent_op);
-
- if (ret) {
- /*
- * Need to reset must_insert_reserved if
- * there was an error so the abort stuff
- * can cleanup the reserved space
- * properly.
- */
- if (must_insert_reserved)
- locked_ref->must_insert_reserved = 1;
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info,
- "run_delayed_extent_op returned %d",
- ret);
- btrfs_delayed_ref_unlock(locked_ref);
- return ret;
- }
+ ret = cleanup_ref_head(trans, fs_info, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
continue;
+ } else if (ret) {
+ return ret;
}
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
- /*
- * Need to drop our head ref lock and re-acquire the
- * delayed ref lock and then re-check to make sure
- * nobody got added.
- */
- spin_unlock(&locked_ref->lock);
- spin_lock(&delayed_refs->lock);
- spin_lock(&locked_ref->lock);
- if (!list_empty(&locked_ref->ref_list) ||
- locked_ref->extent_op) {
- spin_unlock(&locked_ref->lock);
- spin_unlock(&delayed_refs->lock);
- continue;
- }
- ref->in_tree = 0;
- delayed_refs->num_heads--;
- rb_erase(&locked_ref->href_node,
- &delayed_refs->href_root);
- spin_unlock(&delayed_refs->lock);
- } else {
- actual_count++;
- ref->in_tree = 0;
- list_del(&ref->list);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
+ actual_count++;
+ ref->in_tree = 0;
+ rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
+ /*
+ * When we play the delayed ref, also correct the ref_mod on
+ * head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
}
atomic_dec(&delayed_refs->num_entries);
- if (!btrfs_delayed_ref_is_head(ref)) {
- /*
- * when we play the delayed ref, also correct the
- * ref_mod on head
- */
- switch (ref->action) {
- case BTRFS_ADD_DELAYED_REF:
- case BTRFS_ADD_DELAYED_EXTENT:
- locked_ref->node.ref_mod -= ref->ref_mod;
- break;
- case BTRFS_DROP_DELAYED_REF:
- locked_ref->node.ref_mod += ref->ref_mod;
- break;
- default:
- WARN_ON(1);
- }
- }
+ /*
+ * Record the must-insert_reserved flag before we drop the spin
+ * lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ extent_op = locked_ref->extent_op;
+ locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
@@ -2770,33 +2782,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- spin_lock(&delayed_refs->lock);
- locked_ref->processing = 0;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(locked_ref);
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
ret);
return ret;
}
- /*
- * If this node is a head, that means all the refs in this head
- * have been dealt with, and we will pick the next head to deal
- * with, so we must unlock the head and drop it from the cluster
- * list before we release it.
- */
- if (btrfs_delayed_ref_is_head(ref)) {
- if (locked_ref->is_data &&
- locked_ref->total_ref_mod < 0) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= ref->num_bytes;
- spin_unlock(&delayed_refs->lock);
- }
- btrfs_delayed_ref_unlock(locked_ref);
- locked_ref = NULL;
- }
btrfs_put_delayed_ref(ref);
count++;
cond_resched();
@@ -2907,7 +2899,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
- u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
@@ -3000,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
if (trans->transid > async->transid)
goto end;
- ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+ ret = btrfs_run_delayed_refs(trans, async->count);
if (ret)
async->error = ret;
end:
@@ -3059,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count)
+ unsigned long count)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
@@ -3084,7 +3077,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
trans->can_flush_pending_bgs = false;
- ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+ ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3092,7 +3085,7 @@ again:
if (run_all) {
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->href_root);
@@ -3100,33 +3093,16 @@ again:
spin_unlock(&delayed_refs->lock);
goto out;
}
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
- while (node) {
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- if (btrfs_delayed_ref_is_head(&head->node)) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = &head->node;
- refcount_inc(&ref->refs);
-
- spin_unlock(&delayed_refs->lock);
- /*
- * Mutex was contended, block until it's
- * released and try again
- */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
+ /* Mutex was contended, block until it's released and retry. */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- cond_resched();
- goto again;
- } else {
- WARN_ON(1);
- }
- node = rb_next(node);
- }
- spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
@@ -3169,6 +3145,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
+ struct rb_node *node;
int ret = 0;
cur_trans = root->fs_info->running_transaction;
@@ -3184,7 +3161,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->node.refs);
+ refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -3195,13 +3172,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return -EAGAIN;
}
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- list_for_each_entry(ref, &head->ref_list, list) {
+ /*
+ * XXX: We should replace this with a proper search function in the
+ * future.
+ */
+ for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3351,7 +3333,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *,
- struct btrfs_fs_info *,
+ struct btrfs_root *,
u64, u64, u64, u64, u64, u64);
@@ -3391,7 +3373,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
- ret = process_func(trans, fs_info, bytenr, num_bytes,
+ ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
key.offset);
if (ret)
@@ -3399,7 +3381,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
- ret = process_func(trans, fs_info, bytenr, num_bytes,
+ ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0);
if (ret)
goto fail;
@@ -3526,13 +3508,6 @@ again:
goto again;
}
- /* We've already setup this transaction, go ahead and exit */
- if (block_group->cache_generation == trans->transid &&
- i_size_read(inode)) {
- dcs = BTRFS_DC_SETUP;
- goto out_put;
- }
-
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -3556,6 +3531,13 @@ again:
}
WARN_ON(ret);
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
if (i_size_read(inode) > 0) {
ret = btrfs_check_trunc_cache_free_space(fs_info,
&fs_info->global_block_rsv);
@@ -3677,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
* the commit latency by getting rid of the easy block groups while
* we're still allowing others to join the commit.
*/
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
@@ -3703,7 +3685,7 @@ again:
* make sure all the block groups on our dirty list actually
* exist
*/
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
if (!path) {
path = btrfs_alloc_path();
@@ -3758,8 +3740,9 @@ again:
should_put = 0;
/*
- * the cache_write_mutex is protecting
- * the io_list
+ * The cache_write_mutex is protecting the
+ * io_list, also refer to the definition of
+ * btrfs_transaction::io_bgs for more details
*/
list_add_tail(&cache->io_list, io);
} else {
@@ -3817,7 +3800,7 @@ again:
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3899,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans, fs_info,
+ ret = btrfs_run_delayed_refs(trans,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3951,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
@@ -4007,7 +3994,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
- wake_up_atomic_t(&bg->nocow_writers);
+ wake_up_var(&bg->nocow_writers);
/*
* Once for our lookup and once for the lookup done by a previous call
* to btrfs_inc_nocow_writers()
@@ -4016,17 +4003,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
btrfs_put_block_group(bg);
}
-static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
-{
- schedule();
- return 0;
-}
-
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
{
- wait_on_atomic_t(&bg->nocow_writers,
- btrfs_wait_nocow_writers_atomic_t,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
static const char *alloc_name(u64 flags)
@@ -4357,8 +4336,7 @@ again:
/* commit the current transaction and try again */
commit_trans:
- if (need_commit &&
- !atomic_read(&fs_info->open_ioctl_trans)) {
+ if (need_commit) {
need_commit--;
if (need_commit > 0) {
@@ -4566,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
@@ -4627,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
return -ENOSPC;
space_info = __find_space_info(fs_info, flags);
- if (!space_info) {
- ret = create_space_info(fs_info, flags, &space_info);
- if (ret)
- return ret;
- }
+ ASSERT(space_info);
again:
spin_lock(&space_info->lock);
@@ -4730,7 +4704,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4843,7 +4817,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
u64 orig, bool wait_ordered)
{
- struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
@@ -4852,15 +4825,13 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
long time_left;
unsigned long nr_pages;
int loops;
- enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
to_reclaim = items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
- block_rsv = &fs_info->delalloc_block_rsv;
- space_info = block_rsv->space_info;
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -4894,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
skip_async:
- if (!trans)
- flush = BTRFS_RESERVE_FLUSH_ALL;
- else
- flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -4919,6 +4886,13 @@ skip_async:
}
}
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
+};
+
/**
* maybe_commit_transaction - possibly commit the transaction if its ok to
* @root - the root we're allocating for
@@ -4930,18 +4904,29 @@ skip_async:
* will return -ENOSPC.
*/
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 bytes, int force)
+ struct btrfs_space_info *space_info)
{
+ struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
struct btrfs_trans_handle *trans;
+ u64 bytes;
trans = (struct btrfs_trans_handle *)current->journal_info;
if (trans)
return -EAGAIN;
- if (force)
- goto commit;
+ spin_lock(&space_info->lock);
+ if (!list_empty(&space_info->priority_tickets))
+ ticket = list_first_entry(&space_info->priority_tickets,
+ struct reserve_ticket, list);
+ else if (!list_empty(&space_info->tickets))
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ bytes = (ticket) ? ticket->bytes : 0;
+ spin_unlock(&space_info->lock);
+
+ if (!bytes)
+ return 0;
/* See if there is enough pinned space to make this reservation */
if (percpu_counter_compare(&space_info->total_bytes_pinned,
@@ -4956,12 +4941,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
return -ENOSPC;
spin_lock(&delayed_rsv->lock);
+ if (delayed_rsv->size > bytes)
+ bytes = 0;
+ else
+ bytes -= delayed_rsv->size;
+ spin_unlock(&delayed_rsv->lock);
+
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes - delayed_rsv->size) < 0) {
- spin_unlock(&delayed_rsv->lock);
+ bytes) < 0) {
return -ENOSPC;
}
- spin_unlock(&delayed_rsv->lock);
commit:
trans = btrfs_join_transaction(fs_info->extent_root);
@@ -4971,13 +4960,6 @@ commit:
return btrfs_commit_transaction(trans);
}
-struct reserve_ticket {
- u64 bytes;
- int error;
- struct list_head list;
- wait_queue_head_t wait;
-};
-
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
@@ -5005,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+ ret = btrfs_run_delayed_items_nr(trans, nr);
btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
@@ -5027,8 +5009,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = 0;
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info,
- num_bytes, 0);
+ ret = may_commit_transaction(fs_info, space_info);
break;
default:
ret = -ENOSPC;
@@ -5401,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
return ret;
}
@@ -5582,11 +5568,12 @@ again:
}
}
-static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 ret;
spin_lock(&block_rsv->lock);
if (num_bytes == (u64)-1)
@@ -5601,6 +5588,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
}
spin_unlock(&block_rsv->lock);
+ ret = num_bytes;
if (num_bytes > 0) {
if (dest) {
spin_lock(&dest->lock);
@@ -5620,6 +5608,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
space_info_add_old_bytes(fs_info, space_info,
num_bytes);
}
+ return ret;
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
@@ -5643,6 +5632,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
rsv->type = type;
}
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ unsigned short type)
+{
+ btrfs_init_block_rsv(rsv, type);
+ rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+}
+
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type)
{
@@ -5652,9 +5650,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv, type);
- block_rsv->space_info = __find_space_info(fs_info,
- BTRFS_BLOCK_GROUP_METADATA);
+ btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
return block_rsv;
}
@@ -5737,6 +5733,77 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+/**
+ * btrfs_inode_rsv_refill - refill the inode block rsv.
+ * @inode - the inode we are refilling.
+ * @flush - the flusing restriction.
+ *
+ * Essentially the same as btrfs_block_rsv_refill, except it uses the
+ * block_rsv->size as the minimum size. We'll either refill the missing amount
+ * or return if we already have enough space. This will also handle the resreve
+ * tracepoint for the reserved amount.
+ */
+static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret)
+ return ret;
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 1);
+ }
+ return ret;
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ * Unlike normal operation, qgroup meta reservation needs to know if we are
+ * freeing qgroup reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal release.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 released = 0;
+
+ /*
+ * Since we statically set the block_rsv->size we just want to say we
+ * are releasing 0 bytes, and then we'll just get the reservation over
+ * the size free'd.
+ */
+ released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delalloc",
+ btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, released);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root, released);
+}
+
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
@@ -5808,7 +5875,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
- fs_info->delalloc_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
@@ -5828,8 +5894,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
(u64)-1);
- WARN_ON(fs_info->delalloc_block_rsv.size > 0);
- WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5838,21 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- if (!trans->block_rsv)
- return;
-
- if (!trans->bytes_reserved)
- return;
-
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
- trans->bytes_reserved = 0;
-}
/*
* To be called after all the new block groups attached to the transaction
@@ -5894,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
+ trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
num_bytes, 1);
return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
@@ -5938,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret)
return ret;
} else {
@@ -5957,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta(root, *qgroup_reserved);
+ btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
return ret;
}
@@ -5968,104 +6017,36 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}
-/**
- * drop_outstanding_extent - drop an outstanding extent
- * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're releasing.
- *
- * This is called when we are freeing up an outstanding extent, either called
- * after an error or after an extent is written. This will return the number of
- * reserved extents that need to be freed. This must be called with
- * BTRFS_I(inode)->lock held.
- */
-static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
- u64 num_bytes)
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
{
- unsigned drop_inode_space = 0;
- unsigned dropped_extents = 0;
- unsigned num_extents;
-
- num_extents = count_max_extents(num_bytes);
- ASSERT(num_extents);
- ASSERT(inode->outstanding_extents >= num_extents);
- inode->outstanding_extents -= num_extents;
-
- if (inode->outstanding_extents == 0 &&
- test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &inode->runtime_flags))
- drop_inode_space = 1;
-
- /*
- * If we have more or the same amount of outstanding extents than we have
- * reserved then we need to leave the reserved extents count alone.
- */
- if (inode->outstanding_extents >= inode->reserved_extents)
- return drop_inode_space;
-
- dropped_extents = inode->reserved_extents - inode->outstanding_extents;
- inode->reserved_extents -= dropped_extents;
- return dropped_extents + drop_inode_space;
-}
-
-/**
- * calc_csum_metadata_size - return the amount of metadata space that must be
- * reserved/freed for the given bytes.
- * @inode: the inode we're manipulating
- * @num_bytes: the number of bytes in question
- * @reserve: 1 if we are reserving space, 0 if we are freeing space
- *
- * This adjusts the number of csum_bytes in the inode and then returns the
- * correct amount of metadata that must either be reserved or freed. We
- * calculate how many checksums we can fit into one leaf and then divide the
- * number of bytes that will need to be checksumed by this value to figure out
- * how many checksums will be required. If we are adding bytes then the number
- * may go up and we will return the number of additional bytes that must be
- * reserved. If it is going down we will return the number of bytes that must
- * be freed.
- *
- * This must be called with BTRFS_I(inode)->lock held.
- */
-static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
- int reserve)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- u64 old_csums, num_csums;
-
- if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
- return 0;
-
- old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
- if (reserve)
- inode->csum_bytes += num_bytes;
- else
- inode->csum_bytes -= num_bytes;
- num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 reserve_size = 0;
+ u64 csum_leaves;
+ unsigned outstanding_extents;
- /* No change, no need to reserve more */
- if (old_csums == num_csums)
- return 0;
-
- if (reserve)
- return btrfs_calc_trans_metadata_size(fs_info,
- num_csums - old_csums);
+ lockdep_assert_held(&inode->lock);
+ outstanding_extents = inode->outstanding_extents;
+ if (outstanding_extents)
+ reserve_size = btrfs_calc_trans_metadata_size(fs_info,
+ outstanding_extents + 1);
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+ inode->csum_bytes);
+ reserve_size += btrfs_calc_trans_metadata_size(fs_info,
+ csum_leaves);
- return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
+ spin_lock(&block_rsv->lock);
+ block_rsv->size = reserve_size;
+ spin_unlock(&block_rsv->lock);
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
- u64 to_reserve = 0;
- u64 csum_bytes;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
- u64 to_free = 0;
- unsigned dropped;
- bool release_extra = false;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
@@ -6078,127 +6059,44 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
- } else if (current->journal_info) {
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
- }
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
- if (flush != BTRFS_RESERVE_NO_FLUSH &&
- btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ }
if (delalloc_lock)
mutex_lock(&inode->delalloc_mutex);
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ /* Add our new extents and calculate the new rsv size. */
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
- inode->outstanding_extents += nr_extents;
-
- nr_extents = 0;
- if (inode->outstanding_extents > inode->reserved_extents)
- nr_extents += inode->outstanding_extents -
- inode->reserved_extents;
-
- /* We always want to reserve a slot for updating the inode. */
- to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
- to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
- csum_bytes = inode->csum_bytes;
+ btrfs_mod_outstanding_extents(inode, nr_extents);
+ inode->csum_bytes += num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- ret = btrfs_qgroup_reserve_meta(root,
- nr_extents * fs_info->nodesize, true);
- if (ret)
- goto out_fail;
- }
-
- ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
- if (unlikely(ret)) {
- btrfs_qgroup_free_meta(root,
- nr_extents * fs_info->nodesize);
+ ret = btrfs_inode_rsv_refill(inode, flush);
+ if (unlikely(ret))
goto out_fail;
- }
-
- spin_lock(&inode->lock);
- if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &inode->runtime_flags)) {
- to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
- release_extra = true;
- }
- inode->reserved_extents += nr_extents;
- spin_unlock(&inode->lock);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
-
- if (to_reserve)
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), to_reserve, 1);
- if (release_extra)
- btrfs_block_rsv_release(fs_info, block_rsv,
- btrfs_calc_trans_metadata_size(fs_info, 1));
return 0;
out_fail:
spin_lock(&inode->lock);
- dropped = drop_outstanding_extent(inode, num_bytes);
- /*
- * If the inodes csum_bytes is the same as the original
- * csum_bytes then we know we haven't raced with any free()ers
- * so we can just reduce our inodes csum bytes and carry on.
- */
- if (inode->csum_bytes == csum_bytes) {
- calc_csum_metadata_size(inode, num_bytes, 0);
- } else {
- u64 orig_csum_bytes = inode->csum_bytes;
- u64 bytes;
-
- /*
- * This is tricky, but first we need to figure out how much we
- * freed from any free-ers that occurred during this
- * reservation, so we reset ->csum_bytes to the csum_bytes
- * before we dropped our lock, and then call the free for the
- * number of bytes that were freed while we were trying our
- * reservation.
- */
- bytes = csum_bytes - inode->csum_bytes;
- inode->csum_bytes = csum_bytes;
- to_free = calc_csum_metadata_size(inode, bytes, 0);
-
-
- /*
- * Now we need to see how much we would have freed had we not
- * been making this reservation and our ->csum_bytes were not
- * artificially inflated.
- */
- inode->csum_bytes = csum_bytes - num_bytes;
- bytes = csum_bytes - orig_csum_bytes;
- bytes = calc_csum_metadata_size(inode, bytes, 0);
-
- /*
- * Now reset ->csum_bytes to what it should be. If bytes is
- * more than to_free then we would have freed more space had we
- * not had an artificially high ->csum_bytes, so we need to free
- * the remainder. If bytes is the same or less then we don't
- * need to do anything, the other free-ers did the correct
- * thing.
- */
- inode->csum_bytes = orig_csum_bytes - num_bytes;
- if (bytes > to_free)
- to_free = bytes - to_free;
- else
- to_free = 0;
- }
+ nr_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, -nr_extents);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (dropped)
- to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
- if (to_free) {
- btrfs_block_rsv_release(fs_info, block_rsv, to_free);
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), to_free, 0);
- }
+ btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -6206,36 +6104,59 @@ out_fail:
/**
* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
- * @inode: the inode to release the reservation for
- * @num_bytes: the number of bytes we're releasing
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
- * reservations.
+ * reservations, or on error for the same reason.
*/
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- u64 to_free = 0;
- unsigned dropped;
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- dropped = drop_outstanding_extent(inode, num_bytes);
-
- if (num_bytes)
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (dropped > 0)
- to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
if (btrfs_is_testing(fs_info))
return;
- trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
- to_free, 0);
+ btrfs_inode_rsv_release(inode, qgroup_free);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free qgroup meta reservation or convert them.
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add. Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents. This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ unsigned num_extents;
+
+ spin_lock(&inode->lock);
+ num_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
- btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
@@ -6282,10 +6203,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* @inode: inode we're releasing space for
* @start: start position of the space already reserved
* @len: the len of the space already reserved
- *
- * This must be matched with a call to btrfs_delalloc_reserve_space. This is
- * called in the case that we don't need the metadata AND data reservations
- * anymore. So if there is an error or we insert an inline extent.
+ * @release_bytes: the len of the space we consumed or didn't use
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
@@ -6293,9 +6211,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* Also it will handle the qgroup reserved space.
*/
void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
@@ -6591,16 +6510,10 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
bg = btrfs_lookup_block_group(fs_info, start);
ASSERT(bg);
if (atomic_dec_and_test(&bg->reservations))
- wake_up_atomic_t(&bg->reservations);
+ wake_up_var(&bg->reservations);
btrfs_put_block_group(bg);
}
-static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
-{
- schedule();
- return 0;
-}
-
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
{
struct btrfs_space_info *space_info = bg->space_info;
@@ -6623,9 +6536,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
down_write(&space_info->groups_sem);
up_write(&space_info->groups_sem);
- wait_on_atomic_t(&bg->reservations,
- btrfs_wait_bg_reservations_atomic_t,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}
/**
@@ -6857,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
@@ -6958,7 +6869,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (is_data)
- skinny_metadata = 0;
+ skinny_metadata = false;
ret = lookup_extent_backref(trans, info, path, &iref,
bytenr, num_bytes, parent,
@@ -7213,7 +7124,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!list_empty(&head->ref_list))
+ if (!RB_EMPTY_ROOT(&head->ref_tree))
goto out;
if (head->extent_op) {
@@ -7234,9 +7145,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- head->node.in_tree = 0;
rb_erase(&head->href_node, &delayed_refs->href_root);
-
+ RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
/*
@@ -7255,7 +7165,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
ret = 1;
mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(&head->node);
+ btrfs_put_delayed_ref_head(head);
return ret;
out:
spin_unlock(&head->lock);
@@ -7277,6 +7187,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
int old_ref_mod, new_ref_mod;
+ btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
+ root->root_key.objectid,
+ btrfs_header_level(buf), 0,
+ BTRFS_DROP_DELAYED_REF);
ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
buf->len, parent,
root->root_key.objectid,
@@ -7329,16 +7243,21 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
+ btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
+ root_objectid, owner, offset,
+ BTRFS_DROP_DELAYED_REF);
/*
* tree log blocks never actually go into the extent allocation
@@ -7417,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-int __get_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
- return __get_raid_index(cache->flags);
-}
-
static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = "raid10",
[BTRFS_RAID_RAID1] = "raid1",
@@ -7554,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = __get_raid_index(flags);
+ int index = btrfs_bg_flags_to_raid_index(flags);
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7640,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
@@ -7650,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
search:
have_caching_bg = false;
- if (index == 0 || index == __get_raid_index(flags))
+ if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7908,7 +7805,8 @@ checks:
loop:
failed_cluster_refill = false;
failed_alloc = false;
- BUG_ON(index != get_block_group_index(block_group));
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
@@ -8062,6 +7960,51 @@ again:
up_read(&info->groups_sem);
}
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ * hole that is at least as big as @num_bytes.
+ *
+ * @root - The root that will contain this extent
+ *
+ * @ram_bytes - The amount of space in ram that @num_bytes take. This
+ * is used for accounting purposes. This value differs
+ * from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes - Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size - Indicates the minimum amount of space that the
+ * allocator should try to satisfy. In some cases
+ * @num_bytes may be larger than what is required and if
+ * the filesystem is fragmented then allocation fails.
+ * However, the presence of @min_alloc_size gives a
+ * chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size - A hint that you plan on doing more COW. This is the
+ * size in bytes the allocator should try to find free
+ * next to the block it returns. This is just a hint and
+ * may be ignored by the allocator.
+ *
+ * @hint_byte - Hint to the allocator to start searching above the byte
+ * address passed. It might be ignored.
+ *
+ * @ins - This key is modified to record the found hole. It will
+ * have the following values:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ *
+ * @is_data - Boolean flag indicating whether an extent is
+ * allocated for data (true) or metadata (false)
+ *
+ * @delalloc - Boolean flag indicating whether this allocation is for
+ * delalloc or not. If 'true' data_rwsem of block groups
+ * is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
@@ -8306,17 +8249,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- u64 root_objectid, u64 owner,
+ struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
+ root->root_key.objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT);
ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
- ins->offset, 0, root_objectid, owner,
+ ins->offset, 0,
+ root->root_key.objectid, owner,
offset, ram_bytes,
BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
return ret;
@@ -8538,6 +8486,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
+ btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
+ root_objectid, level, 0,
+ BTRFS_ADD_DELAYED_EXTENT);
ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
ins.offset, parent,
root_objectid, level,
@@ -8757,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 parent;
u32 blocksize;
struct btrfs_key key;
+ struct btrfs_key first_key;
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -8777,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
@@ -8841,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation);
+ next = read_tree_block(fs_info, bytenr, generation, level - 1,
+ &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -8894,7 +8849,7 @@ skip:
ret);
}
}
- ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
@@ -9269,6 +9224,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_del_root(trans, fs_info, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
+ err = ret;
goto out_end_trans;
}
@@ -9311,7 +9267,7 @@ out:
* don't have it in the radix (like when we recover after a power fail
* or unmount) so we don't leak memory.
*/
- if (!for_reloc && root_dropped == false)
+ if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
btrfs_handle_fs_error(fs_info, err, NULL);
@@ -9705,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
- index = __get_raid_index(extended_to_chunk(target));
+ index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -9719,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
goto out;
}
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
}
if (index == BTRFS_RAID_RAID10) {
@@ -9752,7 +9708,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free &&
- !device->is_tgtdev_for_dev_replace) {
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -9968,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return 0;
}
-static void __link_block_group(struct btrfs_space_info *space_info,
- struct btrfs_block_group_cache *cache)
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct raid_kobject *rkobj;
+ LIST_HEAD(list);
+ int index;
+ int ret = 0;
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_splice_init(&fs_info->pending_raid_kobjs, &list);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+ list_for_each_entry(rkobj, &list, list) {
+ space_info = __find_space_info(fs_info, rkobj->flags);
+ index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ break;
+ }
+ }
+ if (ret)
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+}
+
+static void link_block_group(struct btrfs_block_group_cache *cache)
{
- int index = get_block_group_index(cache);
+ struct btrfs_space_info *space_info = cache->space_info;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ int index = btrfs_bg_flags_to_raid_index(cache->flags);
bool first = false;
down_write(&space_info->groups_sem);
@@ -9981,27 +9967,20 @@ static void __link_block_group(struct btrfs_space_info *space_info,
up_write(&space_info->groups_sem);
if (first) {
- struct raid_kobject *rkobj;
- int ret;
-
- rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
- if (!rkobj)
- goto out_err;
- rkobj->raid_type = index;
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
- "%s", get_raid_name(index));
- if (ret) {
- kobject_put(&rkobj->kobj);
- goto out_err;
+ struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
}
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
space_info->block_group_kobjs[index] = &rkobj->kobj;
}
-
- return;
-out_err:
- btrfs_warn(cache->fs_info,
- "failed to add kobject for block cache, ignoring");
}
static struct btrfs_block_group_cache *
@@ -10178,7 +10157,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
cache->space_info = space_info;
- __link_block_group(space_info, cache);
+ link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_readonly(info, cache->key.objectid)) {
@@ -10217,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
inc_block_group_ro(cache, 1);
}
+ btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
ret = 0;
error:
@@ -10224,9 +10204,9 @@ error:
return ret;
}
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
@@ -10311,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* with its ->space_info set.
*/
cache->space_info = __find_space_info(fs_info, cache->flags);
- if (!cache->space_info) {
- ret = create_space_info(fs_info, cache->flags,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
- }
+ ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
@@ -10337,7 +10309,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->bytes_super, &cache->space_info);
update_global_block_rsv(fs_info);
- __link_block_group(cache->space_info, cache);
+ link_block_group(cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
@@ -10387,9 +10359,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* remove it.
*/
free_excluded_extents(fs_info, block_group);
+ btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
+ block_group->key.offset);
memcpy(&key, &block_group->key, sizeof(key));
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -10935,7 +10909,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
/* Not writeable = nothing to do. */
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
/* No free space = nothing to do. */
@@ -11106,12 +11080,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
return 1;
}
-static int wait_snapshotting_atomic_t(atomic_t *a)
-{
- schedule();
- return 0;
-}
-
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
while (true) {
@@ -11120,8 +11088,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
ret = btrfs_start_write_no_snapshotting(root);
if (ret)
break;
- wait_on_atomic_t(&root->will_be_snapshotted,
- wait_snapshotting_atomic_t,
- TASK_UNINTERRUPTIBLE);
+ wait_var_event(&root->will_be_snapshotted,
+ !atomic_read(&root->will_be_snapshotted));
}
}