diff options
Diffstat (limited to 'fs/btrfs/delayed-inode.c')
-rw-r--r-- | fs/btrfs/delayed-inode.c | 245 |
1 files changed, 114 insertions, 131 deletions
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e..86ec2edc05e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,10 +18,12 @@ */ #include <linux/slab.h> +#include <linux/iversion.h> #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" #include "ctree.h" +#include "qgroup.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -41,7 +43,7 @@ int __init btrfs_delayed_inode_init(void) return 0; } -void btrfs_delayed_inode_exit(void) +void __cold btrfs_delayed_inode_exit(void) { kmem_cache_destroy(delayed_node_cache); } @@ -87,6 +89,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_lock(&root->inode_lock); node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ @@ -94,9 +97,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_unlock(&root->inode_lock); return node; } - btrfs_inode->delayed_node = node; - /* can be accessed and cached in the inode */ - refcount_add(2, &node->refs); + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation. + */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + spin_unlock(&root->inode_lock); return node; } @@ -254,17 +278,18 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (refcount_dec_and_test(&delayed_node->refs)) { - bool free = false; struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); - if (refcount_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - free = true; - } + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); - if (free) - kmem_cache_free(delayed_node_cache, delayed_node); + kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -528,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; u64 num_bytes; int ret; @@ -554,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info, +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *rsv; + struct btrfs_fs_info *fs_info = root->fs_info; if (!item->bytes_reserved) return; rsv = &fs_info->delayed_block_rsv; + btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -581,36 +609,15 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - /* - * If our block_rsv is the delalloc block reserve then check and see if - * we have our extra reservation for updating the inode. If not fall - * through and try to reserve space quickly. - * - * We used to try and steal from the delalloc block rsv or the global - * reserve, but we'd steal a full reservation, which isn't kind. We are - * here through delalloc which means we've likely just cowed down close - * to the leaf that contains the inode, so we would steal less just - * doing the fallback inode update, so if we do end up having to steal - * from the global block rsv we hopefully only steal one or two blocks - * worth which is less likely to hurt us. - */ - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&inode->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - release = true; - else - src_rsv = NULL; - spin_unlock(&inode->lock); - } - + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret < 0) + return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -618,7 +625,7 @@ static int btrfs_delayed_inode_reserve_metadata( * space. * * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. + * we always reserve enough to update the inode item. */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { @@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata( * EAGAIN to make us stop the transaction we have, so return * ENOSPC instead so that btrfs_dirty_inode knows what to do. */ - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = -ENOSPC; + btrfs_qgroup_free_meta_prealloc(root, num_bytes); + } if (!ret) { node->bytes_reserved = num_bytes; trace_btrfs_space_reservation(fs_info, @@ -643,37 +652,18 @@ static int btrfs_delayed_inode_reserve_metadata( } ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); - - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. - */ if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } - if (release) { - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); - } - return ret; } static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + bool qgroup_free) { struct btrfs_block_rsv *rsv; @@ -685,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(node->root, + node->bytes_reserved); + else + btrfs_qgroup_convert_reserved_meta(node->root, + node->bytes_reserved); node->bytes_reserved = 0; } @@ -786,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, curr->data_len); slot++; - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); @@ -808,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *delayed_item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; char *ptr; int ret; @@ -826,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, delayed_item->data_len); btrfs_mark_buffer_dirty(leaf); - btrfs_delayed_item_release_metadata(fs_info, delayed_item); + btrfs_delayed_item_release_metadata(root, delayed_item); return 0; } @@ -878,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf; struct btrfs_key key; @@ -928,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, goto out; list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } @@ -1071,7 +1065,7 @@ out: no_iref: btrfs_release_path(path); err_out: - btrfs_delayed_inode_release_metadata(fs_info, node); + btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); return ret; @@ -1135,9 +1129,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, * Returns < 0 on error and returns with an aborted transaction with any * outstanding delayed items cleaned up. */ -static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1182,16 +1176,14 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return ret; } -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) { - return __btrfs_run_delayed_items(trans, fs_info, -1); + return __btrfs_run_delayed_items(trans, -1); } -int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) { - return __btrfs_run_delayed_items(trans, fs_info, nr); + return __btrfs_run_delayed_items(trans, nr); } int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, @@ -1323,40 +1315,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!path) goto out; -again: - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) - goto free_path; + do { + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND / 2) + break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); - if (!delayed_node) - goto free_path; + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + break; - path->leave_spinning = 1; - root = delayed_node->root; + path->leave_spinning = 1; + root = delayed_node->root; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto release_path; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + continue; + } - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; - __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - trans->block_rsv = block_rsv; - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty_nodelay(root->fs_info); + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); -release_path: - btrfs_release_path(path); - total_done++; + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; - btrfs_release_prepared_delayed_node(delayed_node); - if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || - total_done < async_work->nr) - goto again; + } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) + || total_done < async_work->nr); -free_path: btrfs_free_path(path); out: wake_up(&delayed_root->wait); @@ -1369,10 +1363,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || - btrfs_workqueue_normal_congested(fs_info->delayed_workers)) - return 0; - async_work = kmalloc(sizeof(*async_work), GFP_NOFS); if (!async_work) return -ENOMEM; @@ -1408,7 +1398,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return; if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { @@ -1464,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible @@ -1501,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(fs_info, item); + btrfs_delayed_item_release_metadata(node->root, item); btrfs_release_delayed_item(item); mutex_unlock(&node->mutex); return 0; @@ -1536,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item->key = item_key; - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1654,28 +1645,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index) { - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; + struct btrfs_delayed_item *curr; + int ret = 0; - list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_for_each_entry(curr, del_list, readdir_list) { if (curr->key.offset > index) break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (refcount_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; + if (curr->key.offset == index) { + ret = 1; + break; + } } - return 0; + return ret; } /* @@ -1744,7 +1725,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, inode->i_version); + btrfs_set_stack_inode_sequence(inode_item, + inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1798,7 +1780,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - inode->i_version = btrfs_stack_inode_sequence(inode_item); + inode_set_iversion_queried(inode, + btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); @@ -1909,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1917,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1927,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - btrfs_delayed_inode_release_metadata(fs_info, delayed_node); + btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); btrfs_release_delayed_inode(delayed_node); } mutex_unlock(&delayed_node->mutex); |