summary | refs | log | tree | commit | diff
path: root/fs/btrfs/ordered-data.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r-- fs/btrfs/ordered-data.c 184
1 files changed, 106 insertions, 78 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..5df02c707aee 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
- /* For nocow write, we can release the qgroup rsv right now */
+ /*
+ * For a NOCOW write we can free the qgroup reserve right now. For a COW
+ * one we transfer the reserved space from the inode's iotree into the
+ * ordered extent by calling btrfs_qgroup_release_data() and tracking
+ * the qgroup reserved amount in the ordered extent, so that later after
+ * completing the ordered extent, when running the data delayed ref it
+ * creates, we free the reserved data with btrfs_qgroup_free_refroot().
+ */
+ if (is_nocow)
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- } else {
- /*
- * The ordered extent has reserved qgroup space, release now
- * and pass the reserved number for qgroup_record to free.
- */
+ else
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
- return ERR_PTR(-ENOMEM);
+ if (!entry) {
+ entry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
+ if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ entry = ERR_PTR(-ESTALE);
+ goto out;
+ }
+ entry->inode = inode;
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
+out:
+ if (IS_ERR(entry) && !is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
+
return entry;
}
@@ -221,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -312,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
{
struct btrfs_inode *inode = entry->inode;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
@@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(file_offset + len <= folio_next_pos(folio));
/*
* Ordered flag indicates whether we still have
@@ -401,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
- unsigned long flags;
bool ret;
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
/*
* If this is a COW write it means we created new extent maps for the
@@ -465,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
u64 cur = file_offset;
+ const u64 end = file_offset + num_bytes;
- trace_btrfs_writepage_end_io_hook(inode, file_offset,
- file_offset + num_bytes - 1,
- uptodate);
+ trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
- while (cur < file_offset + num_bytes) {
+ spin_lock(&inode->ordered_tree_lock);
+ while (cur < end) {
u64 entry_end;
- u64 end;
- u32 len;
+ u64 this_end;
+ u64 len;
node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
@@ -519,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
* |
* cur
*/
- end = min(entry->file_offset + entry->num_bytes,
- file_offset + num_bytes) - 1;
- ASSERT(end + 1 - cur < U32_MAX);
- len = end + 1 - cur;
+ this_end = min(entry_end, end);
+ len = this_end - cur;
+ ASSERT(len < U32_MAX);
if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
}
cur += len;
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -557,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
if (cached && *cached) {
entry = *cached;
goto have_entry;
@@ -597,7 +608,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return finished;
}
@@ -607,23 +618,18 @@ out:
*/
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct list_head *cur;
- struct btrfs_ordered_sum *sum;
-
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
+ struct btrfs_ordered_sum *sum;
+ struct btrfs_ordered_sum *tmp;
+
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
- if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
- while (!list_empty(&entry->list)) {
- cur = entry->list.next;
- sum = list_entry(cur, struct btrfs_ordered_sum, list);
- list_del(&sum->list);
+ btrfs_add_delayed_iput(entry->inode);
+ list_for_each_entry_safe(sum, tmp, &entry->list, list)
kvfree(sum);
- }
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -667,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- spin_lock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_lock(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
@@ -675,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_unlock(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -842,10 +848,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
/*
* Start IO and wait for a given ordered extent to finish.
*
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent but not in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
+ * IO completion code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -865,8 +873,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
- if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+ if (!nowriteback_len) {
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ } else {
+ if (start < nowriteback_start)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+ nowriteback_start - 1);
+ if (nowriteback_start + nowriteback_len < end)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+ nowriteback_start + nowriteback_len,
+ end);
+ }
+ }
if (!freespace_inode)
btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -947,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -962,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -975,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node) {
node = ordered_tree_search(inode, file_offset + len);
@@ -1002,7 +1020,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1017,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
btrfs_assert_inode_locked(inode);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
@@ -1031,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -1044,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -1053,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1075,7 +1093,7 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
@@ -1130,7 +1148,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1160,7 +1178,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1173,7 +1191,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -1191,7 +1209,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
{
struct btrfs_ordered_extent *ordered;
- if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+ if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1199,7 +1217,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return true;
btrfs_put_ordered_extent(ordered);
- unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}
@@ -1229,6 +1247,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
@@ -1250,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/*
* Take the root's ordered_extent_lock to avoid a race with
* btrfs_wait_ordered_extents() when updating the disk_bytenr and
- * disk_num_bytes fields of the ordered extent below. And we disable
- * IRQs because the inode's ordered_tree_lock is used in IRQ context
- * elsewhere.
+ * disk_num_bytes fields of the ordered extent below.
*
* There's no concern about a previous caller of
* btrfs_wait_ordered_extents() getting the trimmed ordered extent