Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r--   fs/btrfs/ordered-data.c   184
1 file changed, 106 insertions, 78 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..5df02c707aee 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 	struct btrfs_ordered_extent *entry;
 	int ret;
 	u64 qgroup_rsv = 0;
+	const bool is_nocow = (flags &
+	       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
 
-	if (flags &
-	    ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
-		/* For nocow write, we can release the qgroup rsv right now */
+	/*
+	 * For a NOCOW write we can free the qgroup reserve right now. For a COW
+	 * one we transfer the reserved space from the inode's iotree into the
+	 * ordered extent by calling btrfs_qgroup_release_data() and tracking
+	 * the qgroup reserved amount in the ordered extent, so that later after
+	 * completing the ordered extent, when running the data delayed ref it
+	 * creates, we free the reserved data with btrfs_qgroup_free_refroot().
+	 */
+	if (is_nocow)
 		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
-		if (ret < 0)
-			return ERR_PTR(ret);
-	} else {
-		/*
-		 * The ordered extent has reserved qgroup space, release now
-		 * and pass the reserved number for qgroup_record to free.
-		 */
+	else
 		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
-		if (ret < 0)
-			return ERR_PTR(ret);
-	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+
 	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
-	if (!entry)
-		return ERR_PTR(-ENOMEM);
+	if (!entry) {
+		entry = ERR_PTR(-ENOMEM);
+		goto out;
+	}
 
 	entry->file_offset = file_offset;
 	entry->num_bytes = num_bytes;
@@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 	entry->disk_num_bytes = disk_num_bytes;
 	entry->offset = offset;
 	entry->bytes_left = num_bytes;
-	entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
+	if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
+		kmem_cache_free(btrfs_ordered_extent_cache, entry);
+		entry = ERR_PTR(-ESTALE);
+		goto out;
+	}
+	entry->inode = inode;
 	entry->compress_type = compress_type;
 	entry->truncated_len = (u64)-1;
 	entry->qgroup_rsv = qgroup_rsv;
@@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 	btrfs_mod_outstanding_extents(inode, 1);
 	spin_unlock(&inode->lock);
 
+out:
+	if (IS_ERR(entry) && !is_nocow)
+		btrfs_qgroup_free_refroot(inode->root->fs_info,
+					  btrfs_root_id(inode->root),
+					  qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
+
 	return entry;
 }
 
@@ -221,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
 	/* One ref for the tree. */
 	refcount_inc(&entry->refs);
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	node = tree_insert(&inode->ordered_tree, entry->file_offset,
 			   &entry->rb_node);
 	if (unlikely(node))
 		btrfs_panic(fs_info, -EEXIST,
 				"inconsistency in ordered tree at offset %llu",
 				entry->file_offset);
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 
 	spin_lock(&root->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
@@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
  * @disk_bytenr: Offset of extent on disk.
  * @disk_num_bytes: Size of extent on disk.
  * @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
  * @compress_type: Compression algorithm used for data.
  *
  * Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -312,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 {
 	struct btrfs_inode *inode = entry->inode;
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	list_add_tail(&sum->list, &entry->list);
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 }
 
 void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
@@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 	if (folio) {
 		ASSERT(folio->mapping);
 		ASSERT(folio_pos(folio) <= file_offset);
-		ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+		ASSERT(file_offset + len <= folio_next_pos(folio));
 
 		/*
 		 * Ordered flag indicates whether we still have
@@ -401,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 				 bool uptodate)
 {
 	struct btrfs_inode *inode = ordered->inode;
-	unsigned long flags;
 	bool ret;
 
 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
 
-	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+	spin_lock(&inode->ordered_tree_lock);
 	ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
 					uptodate);
-	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+	spin_unlock(&inode->ordered_tree_lock);
 
 	/*
 	 * If this is a COW write it means we created new extent maps for the
@@ -465,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 {
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
-	unsigned long flags;
 	u64 cur = file_offset;
+	const u64 end = file_offset + num_bytes;
 
-	trace_btrfs_writepage_end_io_hook(inode, file_offset,
-					  file_offset + num_bytes - 1,
-					  uptodate);
+	trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
 
-	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
-	while (cur < file_offset + num_bytes) {
+	spin_lock(&inode->ordered_tree_lock);
+	while (cur < end) {
 		u64 entry_end;
-		u64 end;
-		u32 len;
+		u64 this_end;
+		u64 len;
 
 		node = ordered_tree_search(inode, cur);
 		/* No ordered extents at all */
@@ -519,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 		 *              |
 		 *              cur
 		 */
-		end = min(entry->file_offset + entry->num_bytes,
-			  file_offset + num_bytes) - 1;
-		ASSERT(end + 1 - cur < U32_MAX);
-		len = end + 1 - cur;
+		this_end = min(entry_end, end);
+		len = this_end - cur;
+		ASSERT(len < U32_MAX);
 
 		if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
-			spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+			spin_unlock(&inode->ordered_tree_lock);
 			btrfs_queue_ordered_fn(entry);
-			spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+			spin_lock(&inode->ordered_tree_lock);
 		}
 		cur += len;
 	}
-	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+	spin_unlock(&inode->ordered_tree_lock);
 }
 
 /*
@@ -557,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
 {
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
-	unsigned long flags;
 	bool finished = false;
 
-	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+	spin_lock(&inode->ordered_tree_lock);
 	if (cached && *cached) {
 		entry = *cached;
 		goto have_entry;
 	}
@@ -597,7 +608,7 @@ out:
 		refcount_inc(&entry->refs);
 		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
 	}
-	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+	spin_unlock(&inode->ordered_tree_lock);
 	return finished;
 }
 
@@ -607,23 +618,18 @@ out:
  */
 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 {
-	struct list_head *cur;
-	struct btrfs_ordered_sum *sum;
-
 	trace_btrfs_ordered_extent_put(entry->inode, entry);
 
 	if (refcount_dec_and_test(&entry->refs)) {
+		struct btrfs_ordered_sum *sum;
+		struct btrfs_ordered_sum *tmp;
+
 		ASSERT(list_empty(&entry->root_extent_list));
 		ASSERT(list_empty(&entry->log_list));
 		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
-		if (entry->inode)
-			btrfs_add_delayed_iput(entry->inode);
-		while (!list_empty(&entry->list)) {
-			cur = entry->list.next;
-			sum = list_entry(cur, struct btrfs_ordered_sum, list);
-			list_del(&sum->list);
+		btrfs_add_delayed_iput(entry->inode);
+		list_for_each_entry_safe(sum, tmp, &entry->list, list)
 			kvfree(sum);
-		}
 		kmem_cache_free(btrfs_ordered_extent_cache, entry);
 	}
 }
@@ -667,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
 	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
 				 fs_info->delalloc_batch);
 
-	spin_lock_irq(&btrfs_inode->ordered_tree_lock);
+	spin_lock(&btrfs_inode->ordered_tree_lock);
 	node = &entry->rb_node;
 	rb_erase(node, &btrfs_inode->ordered_tree);
 	RB_CLEAR_NODE(node);
@@ -675,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
 		btrfs_inode->ordered_tree_last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
-	spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
+	spin_unlock(&btrfs_inode->ordered_tree_lock);
 
 	/*
 	 * The current running transaction is waiting on us, we need to let it
@@ -842,10 +848,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
 /*
  * Start IO and wait for a given ordered extent to finish.
  *
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent but not in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
+ * IO completion code to insert metadata into the btree corresponding to the extent.
  */
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+					    u64 nowriteback_start, u32 nowriteback_len)
 {
 	u64 start = entry->file_offset;
 	u64 end = start + entry->num_bytes - 1;
@@ -865,8 +873,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for the flusher thread to find them
	 */
-	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
-		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+		if (!nowriteback_len) {
+			filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+		} else {
+			if (start < nowriteback_start)
+				filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+							 nowriteback_start - 1);
+			if (nowriteback_start + nowriteback_len < end)
+				filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+							 nowriteback_start + nowriteback_len,
+							 end);
+		}
+	}
 
 	if (!freespace_inode)
 		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -947,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
 {
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
-	unsigned long flags;
 
-	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+	spin_lock(&inode->ordered_tree_lock);
 	node = ordered_tree_search(inode, file_offset);
 	if (!node)
 		goto out;
@@ -962,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
 		trace_btrfs_ordered_extent_lookup(inode, entry);
 	}
 out:
-	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+	spin_unlock(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -975,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	node = ordered_tree_search(inode, file_offset);
 	if (!node) {
 		node = ordered_tree_search(inode, file_offset + len);
@@ -1002,7 +1020,7 @@ out:
 		refcount_inc(&entry->refs);
 		trace_btrfs_ordered_extent_lookup_range(inode, entry);
 	}
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -1017,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 
 	btrfs_assert_inode_locked(inode);
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
 		struct btrfs_ordered_extent *ordered;
 
@@ -1031,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 		refcount_inc(&ordered->refs);
 		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
 	}
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 }
 
 /*
@@ -1044,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	node = ordered_tree_search(inode, file_offset);
 	if (!node)
 		goto out;
@@ -1053,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
 	refcount_inc(&entry->refs);
 	trace_btrfs_ordered_extent_lookup_first(inode, entry);
 out:
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -1075,7 +1093,7 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
 	struct rb_node *next;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	spin_lock_irq(&inode->ordered_tree_lock);
+	spin_lock(&inode->ordered_tree_lock);
 	node = inode->ordered_tree.rb_node;
 	/*
 	 * Here we don't want to use tree_search() which will use tree->last
@@ -1130,7 +1148,7 @@ out:
 		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
 	}
 
-	spin_unlock_irq(&inode->ordered_tree_lock);
+	spin_unlock(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -1160,7 +1178,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 		cachedp = cached_state;
 
 	while (1) {
-		lock_extent(&inode->io_tree, start, end, cachedp);
+		btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
 		ordered = btrfs_lookup_ordered_range(inode, start,
 						     end - start + 1);
 		if (!ordered) {
@@ -1173,7 +1191,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 				refcount_dec(&cache->refs);
 			break;
 		}
-		unlock_extent(&inode->io_tree, start, end, cachedp);
+		btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
 		btrfs_start_ordered_extent(ordered);
 		btrfs_put_ordered_extent(ordered);
 	}
@@ -1191,7 +1209,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 {
 	struct btrfs_ordered_extent *ordered;
 
-	if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+	if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
 		return false;
 
 	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1199,7 +1217,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 		return true;
 
 	btrfs_put_ordered_extent(ordered);
-	unlock_extent(&inode->io_tree, start, end, cached_state);
+	btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
 	return false;
 }
 
@@ -1229,6 +1247,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	 */
 	if (WARN_ON_ONCE(len >= ordered->num_bytes))
 		return ERR_PTR(-EINVAL);
+	/*
+	 * If our ordered extent had an error there's no point in continuing.
+	 * The error may have come from a transaction abort done either by this
+	 * task or some other concurrent task, and the transaction abort path
+	 * iterates over all existing ordered extents and sets the flag
+	 * BTRFS_ORDERED_IOERR on them.
+	 */
+	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+		const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+	}
 	/* We cannot split partially completed ordered extents. */
 	if (ordered->bytes_left) {
 		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
@@ -1250,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	/*
 	 * Take the root's ordered_extent_lock to avoid a race with
 	 * btrfs_wait_ordered_extents() when updating the disk_bytenr and
-	 * disk_num_bytes fields of the ordered extent below. And we disable
-	 * IRQs because the inode's ordered_tree_lock is used in IRQ context
-	 * elsewhere.
+	 * disk_num_bytes fields of the ordered extent below.
 	 *
 	 * There's no concern about a previous caller of
 	 * btrfs_wait_ordered_extents() getting the trimmed ordered extent
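
Note on the writeback change above: callers in this diff, such as btrfs_lock_and_flush_ordered_range(), still use the old btrfs_start_ordered_extent() name while this file now defines btrfs_start_ordered_extent_nowriteback(). A thin wrapper therefore presumably remains outside this file (likely in ordered-data.h, which is not part of this diff). A minimal sketch of such a wrapper, assuming an empty "no writeback" range simply means writing back and waiting on the whole extent:

static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
	/* Assumed wrapper: no skipped range, so flush the entire extent. */
	btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
}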
