From 53b381b3abeb86f12787a6c40fee9b2f71edc23b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 29 Jan 2013 18:40:14 -0500 Subject: Btrfs: RAID5 and RAID6 This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit, blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet) The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..a065dec0e330 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -686,7 +686,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct blk_plug plug; + blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -700,6 +702,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; + blk_finish_plug(&plug); return werr; } -- cgit From bb721703aa551e98dc5c7fb259cf90343408baf2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 29 Jan 2013 18:44:12 -0500 Subject: Btrfs: reduce CPU contention while waiting for delayed extent operations We batch up operations to the extent allocation tree, which allows us to deal with the recursive nature of using the extent allocation tree to allocate extents to the extent allocation tree. It also provides a mechanism to sort and collect extent operations, which makes it much more efficient to record extents that are close together. The delayed extent operations must all be finished before the running transaction commits, so we have code to make sure and run a few of the batched operations when closing our transaction handles. This creates a great deal of contention for the locks in the delayed extent operation tree, and also contention for the lock on the extent allocation tree itself. All the extra contention just slows down the operations and doesn't get things done any faster. This commit changes things to use a wait queue instead. As procs want to run the delayed operations, one of them races in and gets permission to hit the tree, and the others step back and wait for progress to be made. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a065dec0e330..1e7f176bd211 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -156,6 +156,9 @@ loop: spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -577,7 +580,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -589,6 +592,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; -- cgit From 7b5a1c5310a50abc96c9ca07039688027d0a4282 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 08:14:11 +0000 Subject: Btrfs: cleanup unnecessary clear when freeing a transaction or a trans handle We clear the transaction object and the trans handle when they are about to be freed, it is unnecessary, cleanup it. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa60b684..a1455f1e4676 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -650,7 +649,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } assert_qgroups_uptodate(trans); - memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); return err; } -- cgit From 7892b5afe4a1a00af25107e27357db30434ab876 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 08:14:47 +0000 Subject: Btrfs: use common work instead of delayed work Since we do not want to delay the async transaction commit, we should use common work, not delayed work. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a1455f1e4676..86bb105b3982 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1305,13 +1305,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_async_commit { struct btrfs_trans_handle *newtrans; struct btrfs_root *root; - struct delayed_work work; + struct work_struct work; }; static void do_async_commit(struct work_struct *work) { struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work.work); + container_of(work, struct btrfs_async_commit, work); /* * We've got freeze protection passed with the transaction. @@ -1339,7 +1339,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (!ac) return -ENOMEM; - INIT_DELAYED_WORK(&ac->work, do_async_commit); + INIT_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { @@ -1363,7 +1363,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); - schedule_delayed_work(&ac->work, 0); + schedule_work(&ac->work); /* wait for transaction to start and unblock */ if (wait_for_unblock) -- cgit From c6b305a89b1903d63652691ad5eb9f05aa0326b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 09:16:16 -0500 Subject: Btrfs: don't re-enter when allocating a chunk If we start running low on metadata space we will try to allocate a chunk, which could then try to allocate a chunk to add the device entry. The thing is we allocate a chunk before we try really hard to make the allocation, so we should be able to find space for the device entry. Add a flag to the trans handle so we know we're currently allocating a chunk so we can just bail out if we try to allocate another chunk. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 86bb105b3982..24fde97cba81 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -385,6 +385,7 @@ again: h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; h->type = type; + h->allocating_chunk = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); -- cgit From 3edb2a68cb23cd6ca84022421eeae2604722cdc4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:49:33 +0000 Subject: Btrfs: check the return value of btrfs_start_delalloc_inodes() We forget to check the return value of btrfs_start_delalloc_inodes(), fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 24fde97cba81..42dac27207ab 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1427,7 +1427,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, } if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) + return ret; btrfs_wait_ordered_extents(root, 1); } -- cgit From eebc60840636e7351371fc17bcd057384bf0c16a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:50:01 +0000 Subject: Btrfs: check the return value of btrfs_run_ordered_operations() We forget to check the return value of btrfs_run_ordered_operations() when flushing all the pending stuffs, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 42dac27207ab..34610dc6d140 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1451,9 +1451,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(root, 1); - return 0; + return ret; } /* -- cgit From 87533c475187c1420794a2e164bc67a7974f1327 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 29 Jan 2013 10:14:48 +0000 Subject: Btrfs: use bit operation for ->fs_state There is no lock to protect fs_info->fs_state, it will introduce some problems, such as the value may be covered by the other task when several tasks modify it. For example: Task0 - CPU0 Task1 - CPU1 mov %fs_state rax or $0x1 rax mov %fs_state rax or $0x2 rax mov rax %fs_state mov rax %fs_state The expected value is 3, but in fact, it is 2. Though this problem doesn't happen now (because there is only one flag currently), the code is error prone, if we add other flags, the above problem will happen to a certainty. Now we use bit operation for it to fix the above problem. In this way, we can make the code more robust and be easy to add new flags. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 34610dc6d140..baf6d74fd0f2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -61,7 +61,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -113,7 +113,7 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; - } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; @@ -301,7 +301,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, int ret; u64 qgroup_reserved = 0; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -645,9 +645,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) err = -EIO; - } assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); -- cgit From e4a2bcaca9643e7430207810653222fc5187f2be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 6 Feb 2013 16:55:41 -0500 Subject: Btrfs: if we aren't committing just end the transaction if we error out I hit a deadlock where transaction commit was waiting on num_writers to be 0. This happened because somebody came into btrfs_commit_transaction and noticed we had aborted and it went to cleanup_transaction. This shouldn't happen because cleanup_transaction is really to fixup a bad commit, it doesn't do the normal trans handle cleanup things. So if we have an error just do the normal btrfs_end_transaction dance and return. Once we are in the actual commit path we can use cleanup_transaction and be good to go. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index baf6d74fd0f2..5144ad19ef47 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1476,21 +1476,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_ordered_operations(root, 0); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -1507,8 +1511,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { -- cgit From de78b51a2852bddccd6535e9e12de65f92787a1e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 31 Jan 2013 18:21:12 +0000 Subject: btrfs: remove cache only arguments from defrag path The entry point at the defrag ioctl always sets "cache only" to 0; the codepaths haven't run for a long time as far as I can tell. Chris says they're dead code, so remove them. Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5144ad19ef47..60481a53e004 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -959,10 +959,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, } /* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged. */ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; @@ -976,7 +976,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_defrag_leaves(trans, root, cacheonly); + ret = btrfs_defrag_leaves(trans, root); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root); -- cgit From 210549ebe9047ae5a8cc47487203d3ee16a7749b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 9 Feb 2013 23:38:06 +0000 Subject: btrfs: add cancellation points to defrag The defrag operation can take very long, we want to have a way how to cancel it. The code checks for a pending signal at safe points in the defrag loops and returns EAGAIN. This means a user can press ^C after running 'btrfs fi defrag', woks for both defrag modes, files and root. Returning from the command was instant in my light tests, but may take longer depending on the aging factor of the filesystem. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 60481a53e004..d574d830a1c4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -984,6 +984,12 @@ int btrfs_defrag_root(struct btrfs_root *root) if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } } root->defrag_running = 0; return ret; -- cgit From 569e0f358c0c37f6733702d4a5d2c412860f7169 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Feb 2013 11:09:14 -0500 Subject: Btrfs: place ordered operations on a per transaction list Miao made the ordered operations stuff run async, which introduced a deadlock where we could get somebody (sync) racing in and committing the transaction while a commit was already happening. The new committer would try and flush ordered operations which would hang waiting for the commit to finish because it is done asynchronously and no longer inherits the callers trans handle. To fix this we need to make the ordered operations list a per transaction list. We can get new inodes added to the ordered operation list by truncating them and then having another process writing to them, so this makes it so that anybody trying to add an ordered operation _must_ start a transaction in order to add itself to the list, which will keep new inodes from getting added to the ordered operations list after we start committing. This should fix the deadlock and also keeps us from doing a lot more work than we need to during commit. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d574d830a1c4..0c87d18d1881 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -157,6 +157,7 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->ordered_operations); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1456,7 +1457,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - ret = btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(trans, root, 1); return ret; } @@ -1479,7 +1480,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int should_grow = 0; unsigned long now = get_seconds(); - ret = btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); -- cgit From 4b82490649f5c8ecbf888752c325ea68831c497e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Feb 2013 09:13:32 +0000 Subject: Btrfs: fix the qgroup reserved space is released prematurely In start_transactio(), we will try to join the transaction again after the current transaction is committed, so we should not release the reserved space of the qgroup. Fix it. Cc: Arne Jansen Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0c87d18d1881..425d5b57d377 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -383,7 +383,7 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; h->allocating_chunk = false; @@ -402,6 +402,7 @@ again: h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- cgit From 178260b2c14969f29ba39a78df74ed485abc6203 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Feb 2013 09:16:24 +0000 Subject: Btrfs: fix the deadlock between the transaction start/attach and commit Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfs&m=136070705732646&w=2) As we know, if ->in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set ->in_commit. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 425d5b57d377..5767ea1c0150 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -50,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans->in_commit && + type != TRANS_JOIN && + type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -85,6 +93,10 @@ loop: spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret < 0) { -- cgit From d4edf39bd5db443151efc993dac67ec9d6b5b8c1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Feb 2013 09:17:06 +0000 Subject: Btrfs: fix uncompleted transaction In some cases, we need commit the current transaction, but don't want to start a new one if there is no running transaction, so we introduce the function - btrfs_attach_transaction(), which can catch the current transaction, and return -ENOENT if there is no running transaction. But no running transaction doesn't mean the current transction completely, because we removed the running transaction before it completes. In some cases, it doesn't matter. But in some special cases, such as freeze fs, we hope the transaction is fully on disk, it will introduce some bugs, for example, we may feeze the fs and dump the data in the disk, if the transction doesn't complete, we would dump inconsistent data. So we need fix the above problem for those cases. We fixes this problem by introducing a function: btrfs_attach_transaction_barrier() if we hope all the transaction is fully on the disk, even they are not running, we can use this function. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5767ea1c0150..c1ce664c0c39 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -469,11 +469,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) -- cgit From 272d26d0ad8c0e326689f2fa3cdc6a5fcc8e74e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Feb 2013 14:16:39 +0000 Subject: Btrfs: fix missing release of qgroup reservation in commit_transaction() We forget to free qgroup reservation in commit_transaction(),fix it. Signed-off-by: Miao Xie Signed-off-by: Wang Shilong Cc: Arne Jansen Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c1ce664c0c39..955204ca0447 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1553,6 +1553,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } cur_trans = trans->transaction; @@ -1833,6 +1837,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) -- cgit From 2382c5cc7ed0396b61a359765bf5ee125b0a2f46 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 22 Feb 2013 04:33:36 +0000 Subject: Btrfs: use reserved space for creating a snapshot While inserting dir index and updating inode for a snapshot, we'd add delayed items which consume trans->block_rsv, if we don't have any space reserved in this trans handle, we either just return or reserve space again. But before creating pending snapshots during committing transaction, we've done a release on this trans handle, so we don't have space reserved in it at this stage. What we're using is block_rsv of pending snapshots which has already reserved well enough space for both inserting dir index and updating inode, so we need to set trans handle to indicate that we have space now. Signed-off-by: Liu Bo Reviewed-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a83d486cc70c..4330433b7b4f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1123,6 +1123,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; dentry = pending->dentry; parent = dget_parent(dentry); @@ -1276,6 +1277,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, fail: dput(parent); trans->block_rsv = rsv; + trans->bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: -- cgit From f094ac32aba3a51c00e970a2ea029339af2ca048 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 27 Feb 2013 13:28:25 +0000 Subject: Btrfs: fix NULL pointer after aborting a transaction While doing cleanup work on an aborted transaction, we've set the global running transaction pointer to NULL _before_ waiting all other transaction handles to finish, so others'd hit NULL pointer crash when referencing the global running transaction pointer. This first sets a hint to avoid new transaction handle joining, then waits other existing handles to abort or finish so that we can safely set the above global pointer to NULL. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4330433b7b4f..3733c4939a27 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1447,6 +1447,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); WARN_ON(trans->use_count > 1); @@ -1455,8 +1456,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; } spin_unlock(&root->fs_info->trans_lock); -- cgit From e9662f701c85ebc99f532bf8bb53208c0648846a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Feb 2013 10:01:15 +0000 Subject: Btrfs: remove unnecessary dget_parent/dput when creating the pending snapshot Since we have grabbed the parent inode at the beginning of the snapshot creation, and both sync and async snapshot creation release it after the pending snapshots are actually created, it is safe to access the parent inode directly during the snapshot creation, we needn't use dget_parent/dput to fix the parent dentry and get the dir inode. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3733c4939a27..71de435a291e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1068,7 +1068,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1126,8 +1125,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->bytes_reserved = trans->block_rsv->reserved; dentry = pending->dentry; - parent = dget_parent(dentry); - parent_inode = parent->d_inode; + parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -1275,7 +1273,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); trans->block_rsv = rsv; trans->bytes_reserved = 0; no_free_objectid: -- cgit From d5c1207017cd8387b4d3224dd7ab6cf5cd7f1c9a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Feb 2013 10:04:33 +0000 Subject: Btrfs: fix wrong reserved space in qgroup during snap/subv creation There are two problems in the space reservation of the snapshot/ subvolume creation. - don't reserve the space for the root item insertion - the space which is reserved in the qgroup is different with the free space reservation. we need reserve free space for 7 items, but in qgroup reservation, we need reserve space only for 3 items. So we implement new metadata reservation functions for the snapshot/subvolume creation. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 71de435a291e..f11c2e0a3746 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1082,7 +1082,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = pending->error = -ENOMEM; - goto path_alloc_fail; + return ret; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); @@ -1279,8 +1279,6 @@ no_free_objectid: kfree(new_root_item); root_item_alloc_fail: btrfs_free_path(path); -path_alloc_fail: - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; } -- cgit