Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/ctree.c        | 16
-rw-r--r--  fs/btrfs/ctree.h        |  7
-rw-r--r--  fs/btrfs/disk-io.c      | 12
-rw-r--r--  fs/btrfs/extent-tree.c  | 21
-rw-r--r--  fs/btrfs/inode.c        |  5
-rw-r--r--  fs/btrfs/ioctl.c        | 49
-rw-r--r--  fs/btrfs/volumes.c      | 12
7 files changed, 99 insertions, 23 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d92462fe66c8..f64aad613727 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1016,19 +1016,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
/*
- * If we are COWing a node/leaf from the extent, chunk or device trees,
- * make sure that we do not finish block group creation of pending block
- * groups. We do this to avoid a deadlock.
+ * If we are COWing a node/leaf from the extent, chunk, device or free
+ * space trees, make sure that we do not finish block group creation of
+ * pending block groups. We do this to avoid a deadlock.
* COWing can result in allocation of a new chunk, and flushing pending
* block groups (btrfs_create_pending_block_groups()) can be triggered
* when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk and device trees, therefore we could
- * deadlock with ourselves since we are holding a lock on an extent
- * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ * group modifies the extent, chunk, device and free space trees,
+ * therefore we could deadlock with ourselves since we are holding a
+ * lock on an extent buffer that btrfs_create_pending_block_groups() may
+ * try to COW later.
*/
if (root == fs_info->extent_root ||
root == fs_info->chunk_root ||
- root == fs_info->dev_root)
+ root == fs_info->dev_root ||
+ root == fs_info->free_space_root)
trans->can_flush_pending_bgs = false;
cow = btrfs_alloc_tree_block(trans, root, parent_start,
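The comment in this hunk describes a reentrancy guard: COWing a block that belongs to one of the trees which block group creation itself modifies must not trigger that creation, or the task would deadlock against its own extent buffer lock. A standalone C model of the guard follows; the names echo the kernel's, but none of this is kernel API, and the flag consumer (chunk allocation) is reduced to one function.

    #include <stdbool.h>
    #include <stdio.h>

    struct trans_handle {
        bool can_flush_pending_bgs;
    };

    static void create_pending_block_groups(void)
    {
        /* Would re-enter the extent, chunk, device and free space trees. */
        printf("flushing pending block groups\n");
    }

    static void alloc_tree_block(struct trans_handle *trans)
    {
        /* Allocation may want to flush pending block groups... */
        if (trans->can_flush_pending_bgs)
            create_pending_block_groups();
        printf("allocated tree block\n");
    }

    static void cow_block(struct trans_handle *trans, bool sensitive_root)
    {
        /* ...so COW of a tree that the flush itself modifies clears the
         * flag first and restores it once the allocation is done. */
        if (sensitive_root)
            trans->can_flush_pending_bgs = false;
        alloc_tree_block(trans);
        trans->can_flush_pending_bgs = true;
    }

    int main(void)
    {
        struct trans_handle trans = { .can_flush_pending_bgs = true };

        cow_block(&trans, false); /* may flush */
        cow_block(&trans, true);  /* skips the flush: no self-deadlock */
        return 0;
    }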
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a68cf7032f5..7a2a2621f0d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
+struct btrfs_delayed_ref_root;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@@ -786,6 +787,9 @@ enum {
* main phase. The fs_info::balance_ctl is initialized.
*/
BTRFS_FS_BALANCE_RUNNING,
+
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
};
struct btrfs_fs_info {
@@ -2661,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8da2f380d3c0..6a2a2a951705 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg)
while (1) {
again = 0;
+ set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+
/* Make the cleaner go to sleep early. */
if (btrfs_need_cleaner_sleep(fs_info))
goto sleep;
@@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
+ clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
@@ -4201,6 +4204,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
+
+ /*
+ * We need this here because if we've been flipped read-only we won't
+ * get sync() from the umount, so we need to make sure any ordered
+ * extents whose dirty pages haven't started writeback yet actually
+ * get run and error out properly.
+ */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4265,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (pin_bytes)
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b15afeae16df..d81035b7ea7d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_ref_root *delayed_refs =
- &trans->transaction->delayed_refs;
int nr_items = 1; /* Dropping this ref head update. */
if (head->total_ref_mod < 0) {
@@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- cleanup_ref_head_accounting(trans, head);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -4954,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = 0;
break;
case COMMIT_TRANS:
+ /*
+ * If we have pending delayed iputs then we could free up a
+ * bunch of pinned space, so make sure we run the iputs before
+ * we do our pinned bytes check below.
+ */
+ mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+
ret = may_commit_transaction(fs_info, space_info);
break;
default:
@@ -7188,7 +7195,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
- cleanup_ref_head_accounting(trans, head);
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
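The COMMIT_TRANS hunk above encodes an ordering rule rather than a new mechanism: deferred final iputs can turn into pinned space that only a transaction commit frees, so they must run before may_commit_transaction() judges whether a commit is worthwhile. A toy C model of that ordering, with hypothetical byte counts (nothing here is kernel API):

    #include <stdio.h>

    static unsigned long long delayed_iput_bytes = 1 << 20;
    static unsigned long long pinned_bytes;

    static void run_delayed_iputs(void)
    {
        /* The final iput of a deleted inode pins its extents until the
         * next transaction commit frees them. */
        pinned_bytes += delayed_iput_bytes;
        delayed_iput_bytes = 0;
    }

    static int may_commit_transaction(unsigned long long needed)
    {
        /* Committing only pays off if it would unpin enough space. */
        return pinned_bytes >= needed;
    }

    int main(void)
    {
        /* Checked before running the iputs, the commit would look
         * pointless and the reservation would fail with ENOSPC. */
        run_delayed_iputs();
        printf("commit worthwhile: %d\n", may_commit_transaction(1 << 20));
        return 0;
    }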
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43eb4535319d..5c349667c761 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3129,9 +3129,6 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- /* Try to release some metadata so we don't get an OOM but don't wait */
- btrfs_btree_balance_dirty_nodelay(fs_info);
-
return ret;
}
@@ -3254,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
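Together with the BTRFS_FS_CLEANER_RUNNING bit set and cleared in disk-io.c above, this wakeup forms a small protocol: only poke the cleaner when it is not already mid-iteration, because a running cleaner will pick up newly queued iputs before it sleeps, and a missed wakeup is only a short delay (the transaction kthread wakes the cleaner as well). A userspace analogue using C11 atomics; the kernel uses test_bit()/set_bit() on fs_info->flags instead:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_bool cleaner_running;

    static void wake_up_cleaner(void)
    {
        printf("cleaner woken\n");
    }

    /* Producer: queue work, wake the cleaner only if it is asleep. */
    static void add_delayed_iput(void)
    {
        /* ... add the inode to the delayed_iputs list under a lock ... */
        if (!atomic_load(&cleaner_running))
            wake_up_cleaner();
    }

    /* Cleaner: advertise RUNNING for the whole working window. */
    static void cleaner_iteration(void)
    {
        atomic_store(&cleaner_running, true);
        /* ... run delayed iputs, delete unused block groups ... */
        atomic_store(&cleaner_running, false);
        /* sleep until the next wakeup */
    }

    int main(void)
    {
        add_delayed_iput();                   /* idle cleaner: wake it */
        atomic_store(&cleaner_running, true); /* pretend it is mid-loop */
        add_delayed_iput();                   /* busy: skip the wakeup */
        cleaner_iteration();
        return 0;
    }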
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fab9443f6a42..9c8e1734429c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3221,6 +3221,26 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
inode_lock_nested(inode2, I_MUTEX_CHILD);
}
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
+ }
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
@@ -3242,11 +3262,12 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
return -EINVAL;
/*
- * Lock destination range to serialize with concurrent readpages().
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
}
@@ -3905,17 +3926,33 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
len = ALIGN(src->i_size, bs) - off;
if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
}
/*
- * Lock destination range to serialize with concurrent readpages().
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
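btrfs_double_extent_lock() prevents ABBA deadlocks between concurrent clone/dedupe callers by always acquiring the two io_tree ranges in one global order: higher inode pointer first, and the lower offset first when both ranges live in the same inode. A self-contained pthread sketch of the same ordering discipline (the same-inode case is simplified here to taking a single lock):

    #include <pthread.h>
    #include <stdio.h>

    /* Acquire two locks in a fixed global order (descending address,
     * mirroring the helper's "swap so inode1 is the higher pointer"
     * convention) so two tasks locking the same pair cannot deadlock. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if (a < b) {
            pthread_mutex_t *tmp = a;
            a = b;
            b = tmp;
        }
        pthread_mutex_lock(a);
        if (b != a)
            pthread_mutex_lock(b);
    }

    static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        pthread_mutex_unlock(a);
        if (b != a)
            pthread_mutex_unlock(b);
    }

    int main(void)
    {
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Both call orders acquire in the same underlying sequence. */
        double_lock(&m1, &m2);
        double_unlock(&m1, &m2);
        double_lock(&m2, &m1);
        double_unlock(&m2, &m1);
        printf("both orders are safe\n");
        return 0;
    }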
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2576b1a379c9..3e4f8f88353e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7825,6 +7825,18 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
goto out;
}
+
+ /* It's possible this device is a dummy stand-in for a seed device */
+ if (dev->disk_total_bytes == 0) {
+ dev = find_device(fs_info->fs_devices->seed, devid, NULL);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find seed devid %llu",
+ devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",