summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/block-group.c242
-rw-r--r--fs/btrfs/block-group.h8
-rw-r--r--fs/btrfs/btrfs_inode.h46
-rw-r--r--fs/btrfs/check-integrity.c205
-rw-r--r--fs/btrfs/compression.c685
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c157
-rw-r--r--fs/btrfs/ctree.h86
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-ref.c17
-rw-r--r--fs/btrfs/delayed-ref.h51
-rw-r--r--fs/btrfs/dev-replace.c19
-rw-r--r--fs/btrfs/dir-item.c48
-rw-r--r--fs/btrfs/disk-io.c53
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-tree.c327
-rw-r--r--fs/btrfs/extent_io.c334
-rw-r--r--fs/btrfs/extent_io.h10
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c21
-rw-r--r--fs/btrfs/file.c54
-rw-r--r--fs/btrfs/free-space-cache.c24
-rw-r--r--fs/btrfs/inode.c623
-rw-r--r--fs/btrfs/ioctl.c1008
-rw-r--r--fs/btrfs/locking.h7
-rw-r--r--fs/btrfs/lzo.c301
-rw-r--r--fs/btrfs/raid56.c175
-rw-r--r--fs/btrfs/raid56.h22
-rw-r--r--fs/btrfs/reada.c26
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/reflink.c4
-rw-r--r--fs/btrfs/relocation.c81
-rw-r--r--fs/btrfs/scrub.c139
-rw-r--r--fs/btrfs/send.c38
-rw-r--r--fs/btrfs/send.h7
-rw-r--r--fs/btrfs/space-info.c28
-rw-r--r--fs/btrfs/subpage.c290
-rw-r--r--fs/btrfs/subpage.h56
-rw-r--r--fs/btrfs/super.c28
-rw-r--r--fs/btrfs/sysfs.c93
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c12
-rw-r--r--fs/btrfs/tests/inode-tests.c4
-rw-r--r--fs/btrfs/transaction.c11
-rw-r--r--fs/btrfs/tree-log.c822
-rw-r--r--fs/btrfs/tree-log.h18
-rw-r--r--fs/btrfs/volumes.c602
-rw-r--r--fs/btrfs/volumes.h119
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c36
-rw-r--r--fs/btrfs/zoned.c531
-rw-r--r--fs/btrfs/zoned.h39
-rw-r--r--fs/btrfs/zstd.c27
53 files changed, 4608 insertions, 2988 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a3b830b8410a..444e9c89ff3e 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
@@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
@@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
@@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
@@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info,
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
@@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
+ /*
+ * New block group is likely to be used soon. Try to activate it now.
+ * Failure is OK for now.
+ */
+ btrfs_zone_activate(cache);
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, 0, &cache->space_info);
+ cache->bytes_super, cache->zone_unusable,
+ &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
- cache->zone_unusable = cache->alloc_offset - cache->used;
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
@@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
@@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
*/
check_system_chunk(trans, flags);
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
- * new system chunk if necessary. However there are two exceptions:
+ * new system chunk if necessary. However there are three exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
@@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
- * handle and scrub uses the commit root to search for block groups.
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3519,7 +3546,15 @@ out:
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
- * tree extent buffers.
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
@@ -3537,14 +3572,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the later is used before doing
+ * a modification to the chunk btree - use cases for the later are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
@@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
@@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c72a71efcb18..5878b7ce3b78 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -98,6 +98,7 @@ struct btrfs_block_group {
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
+ unsigned int zone_is_active:1;
int disk_cache_state;
@@ -202,7 +203,10 @@ struct btrfs_block_group {
*/
u64 alloc_offset;
u64 zone_unusable;
+ u64 zone_capacity;
u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 76ee1452c57b..ab2a4a52e0bb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,17 +138,34 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
- */
- u64 delalloc_bytes;
-
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc, used by stat to
+ * calculate the real block usage of the file. This is used
+ * only for files.
+ */
+ u64 delalloc_bytes;
+ /*
+ * The offset of the last dir item key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_item_offset;
+ };
+
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
@@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
struct btrfs_dio_private {
struct inode *inode;
- u64 logical_offset;
+
+ /*
+ * Since DIO can use anonymous page, we cannot use page_offset() to
+ * grab the file offset, thus need a dedicated member for file offset.
+ */
+ u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 86816088927f..7e9f90fa0388 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -186,7 +186,6 @@ struct btrfsic_dev_state {
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
@@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
@@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame:
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
@@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame:
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
@@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
@@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
@@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data(
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
@@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
@@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
+ bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
@@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
@@ -1743,16 +1748,18 @@ again:
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
@@ -1767,8 +1774,9 @@ again:
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
@@ -1778,9 +1786,10 @@ again:
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
@@ -1792,9 +1801,10 @@ again:
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
@@ -1921,8 +1931,9 @@ again:
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
@@ -1939,8 +1950,9 @@ again:
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
@@ -1995,9 +2007,9 @@ again:
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
@@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock(
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
@@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
@@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
@@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
@@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock(
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
@@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock(
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
@@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
@@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
@@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
@@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
@@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
@@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
@@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7869ad12bc6e..32da97c3c19d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -28,6 +29,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "subpage.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -172,16 +174,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = page_address(page);
+ kaddr = kmap_atomic(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
+ kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
+ if (btrfs_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
+ btrfs_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
@@ -192,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
return 0;
}
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ unsigned int bi_size = 0;
+ bool last_io = false;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+ * Thus here we have to iterate through all segments to grab correct
+ * bio size.
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bi_size += bvec->bv_len;
+
+ if (bio->bi_status)
+ cb->errors = 1;
+
+ ASSERT(bi_size && bi_size <= cb->compressed_len);
+ last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+ &cb->pending_sectors);
+ /*
+ * Here we must wake up the possible error handler after all other
+ * operations on @cb finished, or we can race with
+ * finish_compressed_bio_*() which may free @cb.
+ */
+ wake_up_var(cb);
+
+ return last_io;
+}
+
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+ unsigned int index;
+ struct page *page;
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(bio);
+ ASSERT(!bio->bi_status);
+ /*
+ * We have verified the checksum already, set page checked so
+ * the end_io handlers know about it
+ */
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+ u64 bvec_start = page_offset(bvec->bv_page) +
+ bvec->bv_offset;
+
+ btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+ bvec->bv_page, bvec_start,
+ bvec->bv_len);
+ }
+
+ bio_endio(cb->orig_bio);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
@@ -206,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
- struct page *page;
- unsigned int index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ unsigned int mirror = btrfs_bio(bio)->mirror_num;
int ret = 0;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
+ if (!dec_and_test_compressed_bio(cb, bio))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ btrfs_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
@@ -248,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
-
- bio_endio(cb->orig_bio);
- }
-
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ finish_compressed_bio_read(cb, bio);
out:
bio_put(bio);
}
@@ -289,6 +336,7 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
@@ -311,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode,
for (i = 0; i < ret; i++) {
if (cb->errors)
SetPageError(pages[i]);
- end_page_writeback(pages[i]);
+ btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ cb->start, cb->len);
put_page(pages[i]);
}
nr_pages -= ret;
@@ -320,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode,
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
+ struct inode *inode = cb->inode;
unsigned int index;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
!cb->errors);
end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+ struct compressed_bio *cb = bio->bi_private;
+
+ if (!dec_and_test_compressed_bio(cb, bio))
+ goto out;
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+ finish_compressed_bio_write(cb);
out:
bio_put(bio);
}
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+ struct compressed_bio *cb,
+ struct bio *bio, int mirror_num)
+{
+ blk_status_t ret;
+
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ return ret;
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) * data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_byten: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+
+
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ unsigned int opf, bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bio->bi_opf = opf;
+ bio->bi_private = cb;
+ bio->bi_end_io = endio_func;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+
+ return bio;
+}
+
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
@@ -394,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
- WARN_ON(!PAGE_ALIGNED(start));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = &inode->vfs_inode;
cb->start = start;
@@ -418,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (use_append) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(device)) {
- kfree(cb);
- bio_put(bio);
- return BLK_STS_NOTSUPP;
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ bio = NULL;
+ goto finish_cb;
+ }
}
-
- bio_set_dev(bio, device->bdev);
- }
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
- kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
-
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
- int len = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->vfs_inode.i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
-
/*
- * Page can only be added to bio if the current bio fits in
- * stripe.
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
*/
- if (!submit) {
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page,
- PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
- }
-
- page->mapping = NULL;
- if (submit || len < PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
-
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ if (ret)
+ goto finish_cb;
}
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- /*
- * Use bio_add_page() to ensure the bio has at least one
- * page.
- */
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ if (ret)
+ goto finish_cb;
+ bio = NULL;
}
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %u nr %u",
- bytes_left, cb->compressed_len, cb->nr_pages);
- }
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
cond_resched();
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+ return 0;
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
+finish_cb:
+ if (bio) {
bio->bi_status = ret;
bio_endio(bio);
}
+ /* Last byte of @cb is submitted, endio will free @cb */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ return ret;
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_start + compressed_len - cur_disk_bytenr) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with previous bio ended, we should still have io not yet
+ * submitted, thus need to finish manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_write(cb);
+ return ret;
}
static u64 bio_end_offset(struct bio *bio)
@@ -539,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit bio for each compressed/regular extents.
+ *
+ * This means, if we have several sectors in the same page points to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
@@ -576,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -597,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -612,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
}
- end = last_offset + PAGE_SIZE - 1;
- lock_extent(tree, last_offset, end);
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
@@ -641,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio we will decrease
+ * subpage::readers and to unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
@@ -679,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
@@ -708,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb)
goto out;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -748,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- u32 pg_len = PAGE_SIZE;
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ ret = errno_to_blk_status(PTR_ERR(comp_bio));
+ comp_bio = NULL;
+ goto finish_cb;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limit on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
/*
- * To handle subpage case, we need to make sure the bio only
- * covers the range we need.
- *
- * If we're at the last page, truncate the length to only cover
- * the remaining part.
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always success.
*/
- if (pg_index == nr_pages - 1)
- pg_len = min_t(u32, PAGE_SIZE,
- compressed_len - pg_index * PAGE_SIZE);
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, pg_len,
- comp_bio, 0);
+ /* Has finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+ if (submit) {
unsigned int nr_sectors;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto finish_cb;
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, pg_len, 0);
+ ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ if (ret)
+ goto finish_cb;
+ comp_bio = NULL;
}
- cur_disk_byte += pg_len;
}
-
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
return 0;
fail2:
@@ -842,6 +951,26 @@ fail1:
out:
free_extent_map(em);
return ret;
+finish_cb:
+ if (comp_bio) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ }
+ /* All bytes of @cb is submitted, endio will free @cb */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ return ret;
+
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_bytenr + compressed_len - cur_disk_byte) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with previous bio ended, we should still have io not yet
+ * submitted, thus need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_read(cb, NULL);
+ return ret;
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 399be0b435bf..56eef0821e3e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -28,8 +28,8 @@ struct btrfs_inode;
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
+ refcount_t pending_sectors;
/* Number of compressed pages in the array */
unsigned int nr_pages;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..c3983bdaf4b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -2487,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2827,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -3065,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -3581,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size, 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -3785,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
- * @cpu_key: array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr: size of @cpu_key/@data_size arrays
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -3803,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
u32 total_size;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
- total_size = total_data + (nr * sizeof(struct btrfs_item));
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -3820,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -3849,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
- ioff - total_data);
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- data_end -= data_size[i];
+ data_end -= batch->data_sizes[i];
btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, data_size[i]);
+ btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3883,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -3911,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -3943,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dff2c8a3e059..7553e9dc5f93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -217,6 +218,9 @@ struct btrfs_root_backup {
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@@ -269,7 +273,11 @@ struct btrfs_super_block {
__le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
@@ -899,6 +907,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -1017,6 +1026,16 @@ struct btrfs_fs_info {
spinlock_t treelog_bg_lock;
u64 treelog_bg;
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_item_for_insert()
+ * doing it, as it would avoid an extra loop over a potentially large
+ * array, and in the case of setup_item_for_insert(), we would be doing
+ * it while holding a write lock on a leaf and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3030,7 +3082,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
@@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3563,6 +3613,9 @@ do { \
(errno), fmt, ##args); \
} while (0)
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1e08eb2b27f0..e164766dcc38 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
- LIST_HEAD(batch);
+ LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ struct btrfs_item_batch batch;
int total_size;
- int nitems;
char *ins_data = NULL;
- struct btrfs_key *ins_keys;
- u32 *ins_sizes;
int ret;
- list_add_tail(&first_item->tree_list, &batch);
- nitems = 1;
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_item->data_len;
+ batch.nr = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;
@@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
if (total_size + next_size > max_size)
break;
- list_add_tail(&next->tree_list, &batch);
- nitems++;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
total_size += next_size;
+ batch.total_data_size += next->data_len;
curr = next;
}
- if (nitems == 1) {
- ins_keys = &first_item->key;
- ins_sizes = &first_item->data_len;
+ if (batch.nr == 1) {
+ batch.keys = &first_item->key;
+ batch.data_sizes = &first_item->data_len;
} else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(nitems * sizeof(u32) +
- nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
- ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
- list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}
- ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
- nitems);
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
goto out;
- list_for_each_entry(curr, &batch, tree_list) {
+ list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
- list_for_each_entry_safe(curr, next, &batch, tree_list) {
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca848b183474..cca7e85e32dd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e22fba272e4f..91a3aabad150 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
@@ -231,17 +231,10 @@ struct btrfs_ref {
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* Through which root is this modification. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d029be40ea6f..c85a7d44da79 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
@@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
@@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f1274d5c3805..7721ce0c0604 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
}
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -273,27 +284,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
+ struct btrfs_dir_item *di;
struct btrfs_key key;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 355ea88d5c5f..59c3be8c1f4c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -683,7 +683,7 @@ err:
return ret;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror)
{
@@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
@@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page)
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ !btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -1953,8 +1954,7 @@ sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
@@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
/*
* For 4K page size, we only support 4K sector size.
- * For 64K page size, we support read-write for 64K sector size, and
- * read-only for 4K sector size.
+ * For 64K page size, we support 64K and 4K sector sizes.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
@@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- if (sectorsize != PAGE_SIZE) {
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- }
- if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
@@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_bdev) {
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3740,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
else if (ret)
return ERR_PTR(ret);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
@@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
- btrfs_advance_sb_log(device, i);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
}
return errors < i ? 0 : -1;
}
@@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
@@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0e7e9526b6a8..a2b5db4ba262 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc3da7585fb7..3fd736a02c1e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
+ struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
+ &num_bytes, &bioc, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
@@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
if (ret < 0)
goto out;
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ stripe = bioc->stripes;
+ for (i = 0; i < bioc->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
@@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
@@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
*/
ret = 0;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
cur += num_bytes;
}
out:
@@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
out:
btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
@@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
+ root->root_key.objectid, 0, false);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
@@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy {
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
@@ -3495,6 +3494,9 @@ struct find_free_extent_ctl {
/* Allocation is called for tree-log */
bool for_treelog;
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
@@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
+ u64 data_reloc_bytenr;
int ret = 0;
- bool skip;
+ bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
@@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
- skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
- (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+ spin_unlock(&block_group->lock);
+ return 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!btrfs_zone_activate(block_group))
+ return 1;
+
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
@@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
goto out;
}
- avail = block_group->length - block_group->alloc_offset;
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
@@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
@@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
@@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+ /*
+ * If we have enough free space left in an already active block
+ * group and we can't activate any other zone now, retry the
+ * active ones with a smaller allocation size. Returning early
+ * from here will tell btrfs_reserve_extent() to haven the
+ * size.
+ */
+ return -ENOSPC;
+ }
+
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
@@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
return 0;
default:
BUG();
@@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte_orig, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- WARN_ON(num_bytes < fs_info->sectorsize);
-
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
- ffe_ctl.hint_byte = hint_byte_orig;
- ffe_ctl.for_treelog = for_treelog;
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+ ffe_ctl->search_start = 0;
/* For clustered allocation */
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.last_ptr = NULL;
- ffe_ctl.use_cluster = true;
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
if (btrfs_is_zoned(fs_info))
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
- if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
- if (for_treelog)
+ if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
}
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -4242,7 +4310,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
@@ -4250,14 +4318,14 @@ search:
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
@@ -4280,10 +4348,11 @@ have_block_group:
goto loop;
bg_ret = NULL;
- ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
@@ -4293,46 +4362,49 @@ have_block_group:
}
/* Checks */
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- release_block_group(block_group, &ffe_ctl, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4341,12 +4413,12 @@ loop:
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
@@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4431,8 +4515,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu tree-log %d",
- flags, num_bytes, for_treelog);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
+ fs_info->nodesize, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
@@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -4859,6 +4944,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -5264,7 +5350,8 @@ skip:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5749,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aaddd7225348..4e03a6d3aa32 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -241,7 +241,7 @@ int __init extent_io_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
+ offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
@@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
@@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
@@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_is_zoned(fs_info))
return btrfs_repair_one_zone(fs_info, logical);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
@@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
+ &map_length, &bioc, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- ASSERT(bbio->mirror_num == 1);
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
+ &map_length, &bioc, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- BUG_ON(mirror_num != bbio->mirror_num);
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
@@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
- struct btrfs_io_bio *repair_io_bio;
+ struct btrfs_bio *repair_bbio;
blk_status_t status;
btrfs_debug(fs_info,
@@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode,
return -EIO;
}
- repair_bio = btrfs_io_bio_alloc(1);
- repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio = btrfs_bio_alloc(1);
+ repair_bbio = btrfs_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
- if (failed_io_bio->csum) {
+ if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
- repair_io_bio->csum = repair_io_bio->csum_inline;
- memcpy(repair_io_bio->csum,
- failed_io_bio->csum + csum_size * icsum, csum_size);
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_io_bio->logical = failrec->start;
- repair_io_bio->iter = repair_bio->bi_iter;
+ repair_bbio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
@@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
@@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
+ bbio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio)
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(io_bio,
+ error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
- ret = btrfs_validate_metadata_buffer(io_bio,
+ ret = btrfs_validate_metadata_buffer(bbio,
page, start, end, mirror);
}
if (ret)
@@ -3106,7 +3123,7 @@ readpage_ok:
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_io_bio_free_csum(io_bio);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -3115,53 +3132,43 @@ readpage_ok:
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
}
/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
+ ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
+ bbio = btrfs_bio(new);
+ btrfs_bio_init(bbio);
+ bbio->iter = bio->bi_iter;
return new;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
@@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio);
bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
+ bbio->iter = bio->bi_iter;
return bio;
}
@@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
- bio = btrfs_bio_alloc(disk_bytenr);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
- bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
@@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
if (wbc) {
struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_dev->bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
@@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
goto error;
}
- btrfs_io_bio(bio)->device = device;
+ btrfs_bio(bio)->device = device;
}
return 0;
error:
@@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
struct extent_state *cached = NULL;
@@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc,
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+ unsigned long *nr_written)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
@@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
- unsigned long dirty_bitmap;
unsigned long flags;
- int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
- int range_start_bit = nbits;
+ int range_start_bit;
int range_end_bit;
/*
@@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
return;
}
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
- dirty_bitmap = subpage->dirty_bitmap;
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
- BTRFS_SUBPAGE_BITMAP_SIZE);
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
@@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
@@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
- &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
if (ret == 1)
return 0;
if (ret)
@@ -4141,8 +4155,20 @@ done:
* capable of that.
*/
if (PageError(page))
- end_extent_writepage(page, ret, start, page_end);
- unlock_page(page);
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
ASSERT(ret <= 0);
return ret;
}
@@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
+ if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+ btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page,
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
- const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < nbits) {
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
@@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page,
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
@@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
free_extent_buffer(eb);
return ret;
}
- if (cache)
+ if (cache) {
+ /* Impiles write in zoned mode */
btrfs_put_block_group(cache);
+ /* Mark the last eb in a block group */
+ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+ set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ }
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
@@ -4873,7 +4907,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (!BTRFS_FS_ERROR(fs_info)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
@@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites which delalloc range has
+ * already been ran (aka, ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
@@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, true);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
+ if (!found_error)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
+ const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
+ const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
@@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ if (data_reloc && zoned)
+ btrfs_inode_lock(inode, 0);
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ if (data_reloc && zoned)
+ btrfs_inode_unlock(inode, 0);
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
@@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- ret = btrfs_alloc_subpage(fs_info, &prealloc,
- BTRFS_SUBPAGE_METADATA);
- if (ret < 0) {
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
}
spin_lock(&mapping->private_lock);
@@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
- int ret;
- int i;
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
- ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
- bytenr >> fs_info->sectorsize_bits,
- PAGE_SIZE / fs_info->nodesize);
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- break;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- break;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
}
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
+out:
return found;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 53abdc280451..0399cf8e3c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a8e02f7b6c7..5a36add21305 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 0b9401a5afd3..d1cbb64a78f3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
- * btrfs_io_bio(bio)->csum instead.
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
@@ -709,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
index = 0;
}
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_digest(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
- kunmap_atomic(data);
+ kunmap_local(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7ff577005d0f..9a3db1365ae8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
@@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- ClearPageChecked(p);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
@@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -870,7 +876,8 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- args->start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
@@ -956,7 +963,8 @@ delete_extent_item:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
args->bytes_found += extent_end - key.offset;
@@ -1021,8 +1029,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &args->extent_item_size, 1);
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
args->extent_inserted = true;
}
@@ -1233,7 +1240,7 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1258,7 +1265,8 @@ again:
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1845,7 +1853,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
@@ -1853,7 +1861,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
@@ -2013,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2621,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset);
+ btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2704,14 +2712,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
- * When cloning we want to avoid transaction aborts when
- * nothing was done and we are attempting to clone parts
- * of inline extents, in such cases -EOPNOTSUPP is
- * returned by __btrfs_drop_extents() without having
- * changed anything in the file.
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
*/
- if (extent_info && !extent_info->is_new_extent &&
- ret && ret != -EOPNOTSUPP)
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
btrfs_abort_transaction(trans, ret);
break;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da0eee7c9e5f..f3fee88c8ee0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
@@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
spin_unlock(&block_group->lock);
}
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
- block_group->zone_unusable >=
- div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
- btrfs_info(fs_info, "free space %llu",
- block_group->length - block_group->alloc_offset);
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ block_group->zone_is_active);
return;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 487533c35ddb..b8c911a4a320 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = page_address(cpage);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -455,11 +457,10 @@ struct async_chunk {
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
@@ -490,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- /* Subpage doesn't support compression yet */
- if (inode->root->fs_info->sectorsize < PAGE_SIZE)
- return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -514,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In above case, both range A and range B will try to unlock the full
+ * page [0, 64K), causing the one finished later will have page
+ * unlocked already, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the tailing partial page will be locked until the full compression
+ * finishes, delaying the write of other range.
+ *
+ * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
+ * first to prevent any submitted async extent to unlock the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(end + 1, PAGE_SIZE))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -615,13 +645,24 @@ again:
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range(<=blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -759,7 +800,7 @@ cont:
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
@@ -840,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = inode->root;
- struct extent_io_tree *io_tree = &inode->io_tree;
- int ret = 0;
-
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
-
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ if (locked_page)
+ unlock_page(locked_page);
+ goto out;
+ }
- /* JDM XXX */
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end);
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ /* We have fall back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
- goto retry;
- }
- goto out_free;
- }
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * here we're doing allocation and writeback of the
- * compressed pages
+ * Here we used to try again by going back to non-compressed
+ * path for ENOSPC. But we can't reserve space even for
+ * compressed size, how could it work for uncompressed size
+ * which requires larger size? So here we directly go error
+ * path.
*/
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
-
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
- /*
- * clear dirty, set writeback and unlock the pages.
- */
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, false);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
+ ins.objectid, /* disk_bytenr */
+ async_extent->ram_size, /* num_bytes */
+ ins.offset, /* disk_num_bytes */
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserve;
}
- return;
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -1007,7 +1030,39 @@ out_free:
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1150,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* fails during the stage where it updates the bytenr of file extent
* items.
*/
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize;
@@ -1186,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
@@ -1325,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
static int cow_file_range_async(struct btrfs_inode *inode,
@@ -1397,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
- async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].async_cow = ctx;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
@@ -1470,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1503,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
- const bool is_reloc_ino = (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
@@ -1866,8 +1918,7 @@ out_check:
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
@@ -1946,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
if (should_nocow(inode, start, end)) {
- ASSERT(!zoned);
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precaution, that we're only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned ||
+ (zoned && btrfs_is_data_reloc_root(inode->root)));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
@@ -2206,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2234,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = bio->bi_iter.bi_sector << 9;
- u32 bio_len = bio->bi_iter.bi_size;
- struct extent_map *em;
- int ret = 0;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
- if (ret < 0)
- goto out;
-
- if (geom.len < bio_len + size)
- ret = 1;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
@@ -2531,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
@@ -2764,7 +2788,7 @@ out_page:
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2819,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -3010,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->bdev)
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
btrfs_free_io_failure_record(inode, start, end);
@@ -3208,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
*
* The length of such check is always one sector size.
*/
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
@@ -3224,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+ csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -3238,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- if (io_bio->device)
- btrfs_dev_stat_inc_and_print(io_bio->device,
+ bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3260,33 +3288,29 @@ zeroit:
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+ btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
return 0;
}
/*
- * For subpage case, above PageChecked is not safe as it's not subpage
- * compatible.
- * But for now only cow fixup and compressed read utilize PageChecked
- * flag, while in this context we can easily use io_bio->csum to
- * determine if we really need to do csum verification.
- *
- * So for now, just exit if io_bio->csum is NULL, as it means it's
- * compressed read, and its compressed data csum has already been
- * verified.
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
*/
- if (io_bio->csum == NULL)
+ if (bbio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
@@ -3303,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
u64 file_offset = pg_off + page_offset(page);
int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
@@ -3313,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
@@ -4004,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
@@ -4034,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const char *name, int name_len)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
@@ -4098,19 +4122,9 @@ skip_backref:
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
- }
-
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4138,15 +4152,14 @@ out:
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
@@ -4175,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
@@ -4187,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
@@ -4201,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
@@ -4368,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -4552,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
@@ -4577,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
@@ -4598,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return err;
}
@@ -4907,9 +4918,9 @@ delete:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -5105,7 +5116,8 @@ again:
len);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
@@ -6435,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
@@ -6527,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+ batch.nr = name ? 2 : 1;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret != 0)
goto fail_unlock;
@@ -7961,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@ -8038,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->logical_offset,
+ dip->file_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
- dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
@@ -8072,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
return ret;
}
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
- struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
const bool uptodate)
{
+ struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8083,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- u64 start = io_bio->logical;
+ const u64 orig_file_offset = dip->file_offset;
+ u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
- __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
@@ -8095,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio,
+ (!csum || !check_data_csum(inode, bbio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
@@ -8105,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
int ret;
- ASSERT((start - io_bio->logical) < UINT_MAX);
+ ASSERT((start - orig_file_offset) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
- &io_bio->bio,
- start - io_bio->logical,
+ &bbio->bio,
+ start - orig_file_offset,
bvec.bv_page, pgoff,
- start, io_bio->mirror_num,
+ start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
@@ -8151,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (bio_op(bio) == REQ_OP_READ) {
- err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
- !err);
- }
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8201,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
} else {
u64 csum_offset;
- csum_offset = file_offset - dip->logical_offset;
+ csum_offset = file_offset - dip->file_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
- btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
@@ -8239,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return NULL;
dip->inode = inode;
- dip->logical_offset = file_offset;
+ dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
@@ -8247,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8277,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8320,7 +8336,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8371,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -8696,9 +8709,9 @@ next:
* did something wrong.
*/
ASSERT(!PageOrdered(page));
+ btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8842,7 +8855,7 @@ again:
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -9152,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents);
- WARN_ON(inode->delalloc_bytes);
- WARN_ON(inode->new_delalloc_bytes);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
WARN_ON(inode->csum_bytes);
WARN_ON(inode->defrag_bytes);
@@ -9450,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9466,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9741,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
*/
btrfs_pin_log_trans(root);
log_pinned = true;
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9761,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9979,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte
};
struct btrfs_fs_info *fs_info = root->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@ -9998,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..02ff085c31bf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
@@ -985,129 +987,32 @@ out:
return ret;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
-
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
-
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
- }
-
- if (min_key.offset == (u64)-1)
- goto none;
-
- min_key.offset++;
- btrfs_release_path(path);
- }
-none:
- btrfs_free_path(path);
- return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
{
struct extent_map *next;
bool ret = true;
@@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
@@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (em->len >= extent_thresh)
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+add:
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
- }
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
- pages[i] = page;
- i_done++;
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
}
- if (!i_done || ret)
- goto out;
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached the limit */
+ if (max_sectors && max_sectors == *sectors_defragged)
+ break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
-
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
return ret;
-
}
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
@@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len) - 1;
+ } else {
+ /* Defrag until file end */
+ last_byte = isize - 1;
+ }
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
- }
-
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
}
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
- /*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
- */
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
-
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
- ret = -EAGAIN;
- goto error;
- }
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
+ while (cur < last_byte) {
+ u64 cluster_end;
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
+ /* The cluster size 256K should always be page aligned */
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
-
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress,
+ &sectors_defragged, max_to_defrag);
btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
- }
+ if (ret < 0)
+ break;
+ cur = cluster_end + 1;
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
@@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
@@ -3136,12 +3097,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3131,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -3220,6 +3175,7 @@ out:
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3187,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto err_drop;
+ goto out;
}
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3271,17 +3231,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
@@ -3293,32 +3255,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3379,22 +3347,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
@@ -4430,7 +4397,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4441,9 +4407,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index a2e1f1f5c6e3..bbc45534ae9a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index c25dfd1a8a54..65cb0766e62d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -32,19 +32,19 @@
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
@@ -112,163 +112,174 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ /*
+ * We never allow a segment header crossing sector boundary, previous
+ * run should ensure we have enough space left inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap(cur_page);
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap(cur_page);
+ return 0;
+}
+
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
int ret = 0;
- char *data_in;
- char *cpage_out, *sizes_ptr;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now, we will later come back and write the total
+ * compressed size
*/
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = page_address(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap(page_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
-
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
-
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = page_address(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
+
+ cur_in += in_len;
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ /*
+ * Check if we're making it bigger after two sectors. And if
+ * it is so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
-
- bytes_left = len - tot_in;
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
-
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
}
- /* store the size of all chunks of compressed data */
- sizes_ptr = page_address(pages[0]);
- write_compress_length(sizes_ptr, tot_out);
+ /* Store the size of all chunks of compressed data */
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, cur_out);
+ kunmap_local(sizes_ptr);
ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
out:
- *out_pages = nr_pages;
-
- if (in_page)
- put_page(in_page);
-
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -283,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
+ char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -290,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
memcpy(dest + *cur_in - orig_in,
- page_address(cur_page) + offset_in_page(*cur_in),
+ kaddr + offset_in_page(*cur_in),
copy_len);
+ kunmap(cur_page);
*cur_in += copy_len;
}
@@ -303,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
u32 len_in;
@@ -311,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ kaddr = kmap(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap(cb->compressed_pages[0]);
cur_in += LZO_LEN;
/*
@@ -345,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
ASSERT(cur_page);
- seg_len = read_compress_length(page_address(cur_page) +
- offset_in_page(cur_in));
+ kaddr = kmap(cur_page);
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap(cur_page);
cur_in += LZO_LEN;
/* Copy the compressed segment payload into workspace */
@@ -431,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = page_address(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -441,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8d268ca8aa7..0e239a4c3b26 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -60,8 +60,7 @@ enum btrfs_rbio_ops {
};
struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
@@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
@@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
@@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
@@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
@@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
@@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
@@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *extra;
if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio)
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
@@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
@@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
+ rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
@@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
@@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
+ stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
@@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_io_bio(bio)->device = stripe->dev;
+ bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
int i = 0;
start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
+ stripe_offset = start - rbio->bioc->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ bio->bi_iter = btrfs_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
@@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
@@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ if (!bioc->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
@@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
+ rbio->bioc->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
{
u64 physical = bio->bi_iter.bi_sector;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
@@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
int i;
for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bbio->raid_map[i];
+ u64 stripe_start = rbio->bioc->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
@@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
/*
@@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
@@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
@@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
@@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2116,22 +2114,22 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
+ ASSERT(bioc->mirror_num == mirror_num);
+ btrfs_bio(bio)->mirror_num = mirror_num;
}
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
@@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(rbio);
return -EIO;
}
@@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
- btrfs_get_bbio(bbio);
+ btrfs_get_bioc(bioc);
}
/*
@@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
@@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
- * We have already increased bio_counter when getting bbio, record it
+ * We have already increased bio_counter when getting bioc, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
@@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
@@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
else
BUG();
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
@@ -2435,7 +2433,7 @@ writeback:
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
+ bioc->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
@@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc, length);
if (IS_ERR(rbio))
return NULL;
@@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * When we get bbio, we have already increased bio_counter, record it
+ * When we get bioc, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2503485db859..72c00fc284b5 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 06713a8fe26b..eb96fdc3be25 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@ start_machine:
}
static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
@@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
kref_init(&zone->refcnt);
zone->elems = 0;
zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
+ for (i = 0; i < bioc->num_stripes; ++i) {
/* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
+ zone->devs[i] = bioc->stripes[i].dev;
}
- zone->ndevs = bbio->num_stripes;
+ zone->ndevs = bioc->num_stripes;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
@@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
int ret;
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
u64 length;
@@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
*/
length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
+ &length, &bioc, 0);
+ if (ret || !bioc || length < fs_info->nodesize)
goto error;
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
btrfs_err(fs_info,
"readahead: more than %d copies not supported",
BTRFS_MAX_MIRRORS);
goto error;
}
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
- dev = bbio->stripes[nzones].dev;
+ dev = bioc->stripes[nzones].dev;
/* cannot read ahead on missing device. */
if (!dev->bdev)
continue;
- zone = reada_find_zone(dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bioc);
if (!zone)
continue;
@@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!have_zone)
goto error;
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return re;
error:
@@ -488,7 +488,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(re);
return re_exist;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d2062d5f71dd..e2b9f8616501 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.root;
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9b0814318e72..e0f93b357548 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
}
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
@@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 914d403b4415..33a0ee7ac590 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -25,6 +25,7 @@
#include "backref.h"
#include "misc.h"
#include "subpage.h"
+#include "zoned.h"
/*
* Relocation overview
@@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1367,8 +1368,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1376,8 +1377,8 @@ again:
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1386,8 +1387,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1396,8 +1397,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
@@ -2852,31 +2857,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- /*
- * On a zoned filesystem, we cannot preallocate the file region.
- * Instead, we dirty and fiemap_write the region.
- */
- if (btrfs_is_zoned(inode->root->fs_info)) {
- struct btrfs_root *root = inode->root;
- struct btrfs_trans_handle *trans;
-
- end = cluster->end - offset + 1;
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- i_size_write(&inode->vfs_inode, end);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- return btrfs_end_transaction(trans);
- }
-
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
@@ -3084,7 +3063,6 @@ release_page:
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
@@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
@@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
- if (btrfs_is_zoned(fs_info) && !ret)
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- if (btrfs_is_zoned(trans->fs_info))
- flags &= ~BTRFS_INODE_PREALLOC;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
@@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 088641ba7a8e..cf82ea6f54fb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -57,7 +57,7 @@ struct scrub_ctx;
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
@@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
- (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
@@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
@@ -1218,14 +1217,14 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
@@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
@@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
@@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
@@ -1348,17 +1347,17 @@ leave_nomem:
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
+ bioc->map_type,
+ bioc->raid_map,
mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bioc->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
@@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ ret = raid56_parity_recover(bio, spage->recover->bioc,
spage->recover->map_length,
mirror_num, 0);
if (ret)
@@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
@@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!spage->page);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
@@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
@@ -1676,7 +1675,7 @@ again:
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
@@ -2102,7 +2101,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
@@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc, length);
if (!rbio)
goto rbio_out;
@@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
@@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
@@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 flags;
int ret;
int slot;
@@ -3044,22 +3043,22 @@ again:
extent_len);
mapped_length = extent_len;
- bbio = NULL;
+ bioc = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
+ extent_logical, &mapped_length, &bioc,
0);
if (!ret) {
- if (!bbio || mapped_length < extent_len)
+ if (!bioc || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
@@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
@@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 72f9b865e847..040324d71118 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -84,6 +84,8 @@ struct send_ctx {
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
@@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ default: return false;
+ }
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
@@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index de91488b7cd0..23bcefc84e49 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -48,6 +48,7 @@ struct btrfs_tlv_header {
enum btrfs_send_cmd {
BTRFS_SEND_C_UNSPEC,
+ /* Version 1 */
BTRFS_SEND_C_SUBVOL,
BTRFS_SEND_C_SNAPSHOT,
@@ -76,6 +77,12 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_END,
BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX_V1,
+
+ /* Version 2 */
+ __BTRFS_SEND_C_MAX_V2,
+
+ /* End */
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index aa5be0b24987..48d77f360a24 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (ticket->steal &&
+ if (!aborted && ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
- ticket->error = -ENOSPC;
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
} else {
flush_state = 0;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
}
spin_unlock(&space_info->lock);
}
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index cb10e56ee31e..29bd8c7a7706 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,11 +63,41 @@
* This means a slightly higher tree locking latency.
*/
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
- struct btrfs_subpage *subpage = NULL;
- int ret;
+ struct btrfs_subpage *subpage;
/*
* We have cases like a dummy extent buffer page, which is not mappped
@@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
*/
if (page->mapping)
ASSERT(PageLocked(page));
+
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
return 0;
- ret = btrfs_alloc_subpage(fs_info, &subpage, type);
- if (ret < 0)
- return ret;
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
attach_page_private(page, subpage);
return 0;
}
@@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
btrfs_free_subpage(subpage);
}
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
{
- if (fs_info->sectorsize == PAGE_SIZE)
- return 0;
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
- *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
- if (!*ret)
- return -ENOMEM;
- spin_lock_init(&(*ret)->lock);
+ spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) {
- atomic_set(&(*ret)->eb_refs, 0);
+ atomic_set(&ret->eb_refs, 0);
} else {
- atomic_set(&(*ret)->readers, 0);
- atomic_set(&(*ret)->writers, 0);
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
}
- return 0;
+ return ret;
}
void btrfs_free_subpage(struct btrfs_subpage *subpage)
@@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
u32 orig_len = *len;
*start = max_t(u64, page_offset(page), orig_start);
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, subpage
+ * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
@@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, page, start, len);
+ /*
+ * We have call sites passing @lock_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
ASSERT(atomic_read(&subpage->writers) >= nbits);
return atomic_sub_and_test(nbits, &subpage->writers);
}
@@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
unlock_page(page);
}
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
{
- const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
- const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned int found_zero;
- btrfs_subpage_assert(fs_info, page, start, len);
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
- /*
- * Here nbits can be 16, thus can go beyond u16 range. We make the
- * first left shift to be calculate in unsigned long (at least u32),
- * then truncate the result to u16.
- */
- return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
}
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap |= tmp;
- if (subpage->uptodate_bitmap == U16_MAX)
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
SetPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap &= ~tmp;
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
ClearPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap &= ~tmp;
- if (subpage->error_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
ClearPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
@@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap &= ~tmp;
- if (subpage->dirty_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0) {
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
ASSERT(PageWriteback(page));
end_page_writeback(page);
}
@@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap &= ~tmp;
- if (subpage->ordered_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
ClearPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
unsigned long flags; \
bool ret; \
\
spin_lock_irqsave(&subpage->lock, flags); \
- ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
spin_unlock_irqrestore(&subpage->lock, flags); \
return ret; \
}
@@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
@@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage->dirty_bitmap == 0);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages() or extent_write_full_page().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call subpage helper to handle the case.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For regular page size case, we just unlock the page */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without writers number.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers))
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 0120948f37a1..7accb5c40d33 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,38 @@
#include <linux/spinlock.h>
/*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpapge bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
*/
-#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_start indicates where the bitmap starts, the length is always
+ * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
/*
* Structure to trace status of each sector inside a page, attached to
@@ -18,10 +46,6 @@
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- u16 uptodate_bitmap;
- u16 error_bitmap;
- u16 dirty_bitmap;
- u16 writeback_bitmap;
/*
* Both data and metadata needs to track how many readers are for the
* page.
@@ -38,14 +62,11 @@ struct btrfs_subpage {
* manages whether the subpage can be detached.
*/
atomic_t eb_refs;
- /* Structures only used by data */
- struct {
- atomic_t writers;
- /* Tracke pending ordered extent in this sector */
- u16 ordered_bitmap;
- };
+ /* Structures only used by data */
+ atomic_t writers;
};
+ unsigned long bitmaps[];
};
enum btrfs_subpage_type {
@@ -53,15 +74,15 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page);
/* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
@@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 537d90bf5d84..a1c54a2c787c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
@@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_device *dev, *first_dev = NULL;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+ * There should be always a valid pointer in latest_dev, it may be stale
+ * for a short moment in case it's being deleted but still valid until
+ * the end of RCU grace period.
*/
rcu_read_lock();
- list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25a6f587852b..f9eff3b0f77c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
static ssize_t send_stream_version_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
@@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
int i;
for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i ? " " : ""), rescue_opts[i]);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
@@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
}
@@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
/*
@@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
str = "UNKNOWN\n";
break;
}
- return scnprintf(buf, PAGE_SIZE, "%s", str);
+ return sysfs_emit(buf, "%s", str);
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
@@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
}
BTRFS_ATTR(, generation, btrfs_generation_show);
@@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n",
- READ_ONCE(fs_info->bg_reclaim_threshold));
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
@@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
@@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(device->scrub_speed_max));
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
}
static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
@@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
@@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
devid_kobj);
if (!device->dev_stats_valid)
- return scnprintf(buf, PAGE_SIZE, "invalid\n");
+ return sysfs_emit(buf, "invalid\n");
/*
* Print all at once so we get a snapshot of all values from the same
* time. Keep them in sync and in order of definition of
* btrfs_dev_stat_values.
*/
- return scnprintf(buf, PAGE_SIZE,
+ return sysfs_emit(buf,
"write_errs %d\n"
"read_errs %d\n"
"flush_errs %d\n"
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index df54cdfdc250..2a95f7224e18 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 73e96d505f4f..c2e72e7a8ff0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize)
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize)
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
@@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index c9874b12d337..cac89c388131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 14b9fdc8aaa9..1c3a1189c0bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -331,7 +331,7 @@ loop:
*/
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
@@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (TRANS_ABORTED(trans) ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
@@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
goto cleanup_transaction;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f7efc26aa82a..8ab33caf016f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,7 +94,7 @@ enum {
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
@@ -207,7 +207,7 @@ again:
}
atomic_inc(&root->log_writers);
- if (ctx && !ctx->logging_new_name) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
item_size = btrfs_item_size_nr(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
@@ -585,6 +583,36 @@ no_copy:
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
@@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -893,11 +921,11 @@ out:
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
@@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
name_len);
if (ret)
goto out;
@@ -939,9 +967,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -950,29 +980,34 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -1084,7 +1119,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = btrfs_unlink_inode(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
@@ -1155,7 +1190,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
@@ -1182,8 +1217,10 @@ next:
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1192,8 +1229,10 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1313,7 +1352,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1374,10 +1413,11 @@ out:
return ret;
}
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1411,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
@@ -1517,10 +1557,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1555,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -1571,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
+ ret = add_link(trans, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
@@ -1580,6 +1622,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1936,8 +1979,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1957,12 +2000,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1977,7 +2020,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2003,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (!exists)
goto out;
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
+ ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
if (ret)
goto out;
@@ -2233,13 +2280,13 @@ out:
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2281,7 +2328,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -2300,7 +2347,7 @@ again:
}
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
ret = btrfs_run_delayed_items(trans);
@@ -2482,7 +2529,9 @@ again:
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2511,7 +2560,7 @@ again:
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
@@ -3019,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -3310,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3434,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
if (inode->logged_trans == trans->transid)
return true;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return false;
+
/*
* The inode's logged_trans is always 0 when we load it (because it is
* not persisted in the inode item or elsewhere). So if it is 0, the
@@ -3472,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
struct btrfs_root *log;
struct btrfs_dir_item *di;
@@ -3485,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
@@ -3537,49 +3586,36 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err == -ENOSPC) {
+ if (err < 0)
btrfs_set_log_full_commit(trans);
- err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
- btrfs_abort_transaction(trans, err);
- }
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
if (!inode_logged(trans, inode))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
@@ -3615,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size_nr(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size_nr(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second items starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ int key_type,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ const bool inode_logged_before = inode_logged(trans, inode);
+ u64 last_logged_key_offset;
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ last_logged_key_offset = inode->last_dir_item_offset;
+ else
+ last_logged_key_offset = inode->last_dir_index_offset;
+
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != key_type) {
+ last_found = true;
+ break;
+ }
+
+ ctx->last_dir_item_offset = key.offset;
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if ((btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (!inode_logged_before)
+ goto add_to_batch;
+
+ /*
+ * If we were logged before and have logged dir items, we can skip
+ * checking if any item with a key offset larger than the last one
+ * we logged is in the log tree, saving time and avoiding adding
+ * contention on the log tree.
+ */
+ if (key.offset > last_logged_key_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+ * it to a batch for bulk insertion.
+ */
+ ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ btrfs_release_path(dst_path);
+ goto add_to_batch;
+ }
+
+ /*
+ * Item exists in the log. Overwrite the item in the log if it
+ * has different content or do nothing if it has exactly the same
+ * content. And then flush the current batch if any - do it after
+ * overwriting the current item, or we would deadlock otherwise,
+ * since we are holding a path for the existing item.
+ */
+ ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+ if (ret < 0)
+ return ret;
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ batch_size = 0;
+ }
+ continue;
+add_to_batch:
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
@@ -3713,62 +3949,14 @@ search:
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
-
- if (need_resched()) {
- btrfs_release_path(path);
- cond_resched();
- goto search;
- }
-
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path,
+ key_type, ctx);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3782,21 +3970,26 @@ search:
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != key_type) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
- &tmp);
+ &min_key);
if (ret)
err = ret;
else
- last_offset = tmp.offset;
+ last_offset = min_key.offset;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
@@ -3829,7 +4022,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3839,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
int ret;
int key_type = BTRFS_DIR_ITEM_KEY;
+ /*
+ * If this is the first time we are being logged in the current
+ * transaction, or we were logged before but the inode was evicted and
+ * reloaded later, in which case its logged_trans is 0, reset the values
+ * of the last logged key offsets. Note that we don't use the helper
+ * function inode_logged() here - that is because the function returns
+ * true after an inode eviction, assuming the worst case as it can not
+ * know for sure if the inode was logged before. So we can not skip key
+ * searches in the case the inode was evicted, because it may not have
+ * been logged in this transaction and may have been logged in a past
+ * transaction, so we need to reset the last dir item and index offsets
+ * to (u64)-1.
+ */
+ if (inode->logged_trans != trans->transid) {
+ inode->last_dir_item_offset = (u64)-1;
+ inode->last_dir_index_offset = (u64)-1;
+ }
again:
min_key = 0;
max_key = 0;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ ctx->last_dir_item_offset = inode->last_dir_item_offset;
+ else
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path, key_type,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3853,8 +4068,11 @@ again:
}
if (key_type == BTRFS_DIR_ITEM_KEY) {
+ inode->last_dir_item_offset = ctx->last_dir_item_offset;
key_type = BTRFS_DIR_INDEX_KEY;
goto again;
+ } else {
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
}
return 0;
}
@@ -3865,17 +4083,21 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ if (!inode_logged(trans, inode))
+ return 0;
+
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -3892,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
@@ -3917,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ int ret;
+
+ do {
+ ret = btrfs_truncate_inode_items(trans, log_root, inode,
+ new_size, min_type, NULL);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
@@ -4089,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
struct list_head ordered_sums;
@@ -4103,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = nr;
for (i = 0; i < nr; i++) {
ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ batch.total_data_size += ins_sizes[i];
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret) {
kfree(ins_data);
return ret;
@@ -4321,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct btrfs_map_token token;
@@ -4340,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- drop_args.path = path;
- drop_args.start = em->start;
- drop_args.end = em->start + em->len;
- drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
- ret = btrfs_drop_extents(trans, log, inode, &drop_args);
- if (ret)
- return ret;
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (inode_logged(trans, inode)) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
@@ -4505,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY,
- NULL);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
@@ -4538,7 +4787,6 @@ out:
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
@@ -4603,7 +4851,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4692,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4770,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -5050,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
- ret = btrfs_log_inode(trans, root,
+ ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
@@ -5110,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, ctx);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5222,7 +5469,7 @@ again:
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
+ } else if (ret > 0 &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
@@ -5322,7 +5569,7 @@ next_key:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -5330,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
int err = 0;
int ret = 0;
bool fast_search = false;
@@ -5372,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
- *
- * Otherwise commit the delayed inode only if the full sync flag is set,
- * as we want to make sure an up to date version is in the subvolume
- * tree so copy_inode_items_to_log() / copy_items() can find it and copy
- * it to the log tree. For a non full sync, we always log the inode item
- * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode))
- ret = btrfs_commit_inode_delayed_items(trans, inode);
- else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ err = btrfs_commit_inode_delayed_items(trans, inode);
+ if (err)
+ goto out;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5426,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ ret = drop_inode_items(trans, log, path, inode, max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5450,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0, NULL);
- if (ret != -EAGAIN)
- break;
- }
+ if (inode_logged(trans, inode))
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5470,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5494,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
+ err = btrfs_log_holes(trans, inode, path);
if (err)
goto out_unlock;
}
@@ -5521,16 +5754,14 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx);
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5545,59 +5776,52 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
/*
- * If we are logging that an ancestor inode exists as part of logging a
- * new name from a link or rename operation, don't mark the inode as
- * logged - otherwise if an explicit fsync is made against an ancestor,
- * the fsync considers the inode in the log and doesn't sync the log,
- * resulting in the ancestor missing after a power failure unless the
- * log was synced as part of an fsync against any other unrelated inode.
- * So keep it simple for this case and just don't flag the ancestors as
- * logged.
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
*/
- if (!ctx ||
- !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
- &inode->vfs_inode != ctx->inode)) {
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- /*
- * Don't update last_log_commit if we logged that an inode exists.
- * We do this for two reasons:
- *
- * 1) We might have had buffered writes to this inode that were
- * flushed and had their ordered extents completed in this
- * transaction, but we did not previously log the inode with
- * LOG_INODE_ALL. Later the inode was evicted and after that
- * it was loaded again and this LOG_INODE_EXISTS log operation
- * happened. We must make sure that if an explicit fsync against
- * the inode is performed later, it logs the new extents, an
- * updated inode item, etc, and syncs the log. The same logic
- * applies to direct IO writes instead of buffered writes.
- *
- * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
- * is logged with an i_size of 0 or whatever value was logged
- * before. If later the i_size of the inode is increased by a
- * truncate operation, the log is synced through an fsync of
- * some other inode and then finally an explicit fsync against
- * this inode is made, we must make sure this fsync logs the
- * inode with the new i_size, the hole between old i_size and
- * the new i_size, and syncs the log.
- */
- if (inode_only != LOG_INODE_EXISTS)
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
- }
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
-
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
return err;
@@ -5697,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_dir_list *dir_elem;
int ret = 0;
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5773,7 +6005,7 @@ process_leaf:
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
@@ -5917,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret && ctx && ctx->log_new_dentries)
+ if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
@@ -5967,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
@@ -6022,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
- ret = btrfs_log_inode(trans, root, inode,
+ ret = btrfs_log_inode(trans, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
@@ -6165,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6182,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
@@ -6308,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6322,8 +6552,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6340,8 +6569,7 @@ again:
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6369,8 +6597,7 @@ again:
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6378,14 +6605,15 @@ again:
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
- btrfs_handle_fs_error(fs_info, ret,
- "failed to record the log root in transaction");
+ btrfs_abort_transaction(trans, ret);
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6402,6 +6630,8 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 731bd9c029f5..f6811c3df38a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
@@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ec3b8ac8fa3..61ac57bcbf1a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map);
/*
@@ -508,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
@@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
@@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
@@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
@@ -1222,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1286,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
@@ -1843,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1882,18 +1891,22 @@ out:
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
{
- struct inode *inode = bdev->bd_inode;
+ struct path path;
struct timespec64 now;
+ int ret;
- /* Shouldn't happen but just in case. */
- if (!inode)
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- now = current_time(inode);
- generic_update_time(inode, &now, S_MTIME | S_CTIME);
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -1917,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1986,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device, in the context
* where this function called, there should be always be another device (or
* this_dev) which is active.
@@ -2005,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
@@ -2069,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(bdev);
+ update_dev_time(device_path);
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2081,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
-
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- device_path && strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
+ ret = -ENOENT;
goto out;
}
@@ -2126,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2159,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
@@ -2210,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- close_fs_devices(cur_devices);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -2305,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
@@ -2320,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &disk_super);
- if (ret)
- return ERR_PTR(ret);
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret)
+ return ret;
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid);
-
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- if (!device)
- device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
/*
@@ -2459,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
@@ -2468,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
@@ -2480,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2505,13 +2551,14 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2610,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -2627,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
}
device->fs_devices = fs_devices;
@@ -2733,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(bdev);
+ update_dev_time(device_path);
return ret;
@@ -2826,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2854,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3096,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -4889,8 +4943,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4973,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
/*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
@@ -5377,7 +5433,7 @@ error_del_extent:
return block_group;
}
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -5578,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -5597,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
@@ -5618,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5795,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
int i;
int again = 1;
@@ -5804,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
- if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
- swap(bbio->stripes[i], bbio->stripes[i + 1]);
- swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ atomic_set(&bioc->error, 0);
+ refcount_set(&bioc->refs, 1);
- bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
- bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@ -5859,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5882,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
int ret = 0;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
+ /* Discard always returns a bioc. */
+ ASSERT(bioc_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
@@ -5946,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ bioc->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
+ bioc->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ bioc->stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -5976,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ bioc->stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ bioc->stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ bioc->stripes[i].length = length;
}
stripe_index++;
@@ -5998,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
@@ -6024,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
@@ -6033,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, 0, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
@@ -6056,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
@@ -6064,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
@@ -6103,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
@@ -6138,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
@@ -6168,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
+ bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
@@ -6200,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6304,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
@@ -6319,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
@@ -6328,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
@@ -6472,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ bioc->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
- /* build raid_map */
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
@@ -6497,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- sort_parity_stripes(bbio, num_stripes);
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
@@ -6513,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6530,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
@@ -6546,43 +6600,43 @@ out:
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ length, bioc_ret);
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
+ bio->bi_private = bioc->private;
+ bio->bi_end_io = bioc->end_io;
bio_endio(bio);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_io_context *bioc = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
- atomic_inc(&bbio->error);
+ atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+ struct btrfs_device *dev = btrfs_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -6597,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio)
}
}
- if (bio == bbio->orig_bio)
+ if (bio == bioc->orig_bio)
is_orig_bio = 1;
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = bbio->orig_bio;
+ bio = bioc->orig_bio;
}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&bbio->error) > bbio->max_errors) {
+ if (atomic_read(&bioc->error) > bioc->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
@@ -6622,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio)
bio->bi_status = BLK_STS_OK;
}
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->device = dev;
+ bio->bi_private = bioc;
+ btrfs_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
@@ -6663,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfsic_submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
/* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ WARN_ON(bio != bioc->orig_bio);
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
+ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
}
}
@@ -6691,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int ret;
int dev_nr;
int total_devs;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ &map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ total_devs = bioc->num_stripes;
+ bioc->orig_bio = first_bio;
+ bioc->private = first_bio->bi_private;
+ bioc->end_io = first_bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, bioc->num_stripes);
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
+ ret = raid56_parity_write(bio, bioc, map_length);
} else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
+ ret = raid56_parity_recover(bio, bioc, map_length,
+ mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
@@ -6735,12 +6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
+ dev = bioc->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
+ bioc_error(bioc, first_bio, logical);
continue;
}
@@ -6749,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ ASSERT((args->devid != (u64)-1) || args->missing);
+
+ if ((args->devid != (u64)-1) && device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ if (!args->missing)
+ return true;
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+}
+
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
@@ -6762,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
- if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ if (dev_args_match_fs_devices(args, fs_devices)) {
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
+ if (dev_args_match_device(args, device))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- if (!fsid ||
- !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &seed_devs->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
}
@@ -6952,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
@@ -7029,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL);
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7151,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -7159,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = btrfs_device_id(leaf, dev_item);
+ devid = args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -7171,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7236,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf,
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
- u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
@@ -7841,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7922,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -7977,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device boundary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2183361db614..3b8130680749 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -236,17 +236,40 @@ struct btrfs_fs_devices {
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid including missing and
+ * replace-target device and excludes seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices that successfully opened, including
+ * replace-target, excludes seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device, excludes the transient replace-target
+ * device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The mount device or a device with highest generation after removal
+ * or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
@@ -300,48 +323,62 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
*/
-struct btrfs_io_bio {
+struct btrfs_bio {
unsigned int mirror_num;
+
+ /* @device is for stripe IO submission. */
struct btrfs_device *device;
- u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ return container_of(bio, struct btrfs_bio, bio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio {
+/*
+ * Context for IO subsmission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
@@ -361,7 +398,7 @@ struct btrfs_bio {
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
struct btrfs_device_info {
@@ -396,11 +433,11 @@ struct map_lookup {
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -414,6 +451,22 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
@@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
+ struct btrfs_io_context **bioc_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid,
+ struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 8a4514283a4b..2837b4c8424d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* matches our target xattr, so lets check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8afa90074891..767a0c6c9694 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (in_page)
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
+ kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
-
+ kunmap(pages_in[page_in_index]);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 47af1ab3bf12..67d932d70798 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -4,6 +4,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
+#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
@@ -39,12 +40,30 @@
#define BTRFS_NR_SB_LOG_ZONES 2
/*
+ * Minimum of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
+/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
+ int i;
- ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
- zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
- empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
- empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
- full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
- full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
/*
* Possible states of log buffer zones
@@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ unsigned int max_active_zones;
+ unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ max_active_zones = queue_max_active_zones(queue);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
@@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
}
/* Get zones type */
+ nactive = 0;
while (sector < nr_sectors) {
nr_zones = BTRFS_REPORT_NR_ZONES;
ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
@@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
for (i = 0; i < nr_zones; i++) {
if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
__set_bit(nreported, zone_info->seq_zones);
- if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
__set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
nreported++;
}
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
@@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
out:
kfree(zones);
out_free_zone_info:
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->empty_zones);
bitmap_free(zone_info->seq_zones);
kfree(zone_info);
@@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return;
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
kfree(zone_info);
@@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
- * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
@@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
- ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+ ASSERT(sb_zone_is_full(reset));
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
@@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset->wp = reset->start;
}
} else if (ret != -ENOENT) {
- /* For READ, we want the precious one */
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
if (wp == zones[0].start << SECTOR_SHIFT)
- wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
wp -= BTRFS_SUPER_INFO_SIZE;
}
@@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
return true;
}
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
struct blk_zone *zone;
+ int i;
if (!is_sb_log_zone(zinfo, mirror))
- return;
+ return 0;
zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
- if (zone->cond != BLK_ZONE_COND_FULL) {
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
- if (zone->wp == zone->start + zone->len)
+ zone->wp = zone->start + zone->len;
zone->cond = BLK_ZONE_COND_FULL;
-
- return;
+ }
+ return 0;
}
- zone++;
- ASSERT(zone->cond != BLK_ZONE_COND_FULL);
- if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
- if (zone->wp == zone->start + zone->len)
- zone->cond = BLK_ZONE_COND_FULL;
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
@@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
return pos;
}
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
@@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
*bytes = length;
while (length) {
btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
physical += device->zone_info->zone_size;
length -= device->zone_info->zone_size;
}
@@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
map = em->map_lookup;
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
if (!alloc_offsets) {
- free_extent_map(em);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
}
for (i = 0; i < map->num_stripes; i++) {
@@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
@@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = fs_info->zone_size;
+ alloc_offsets[i] = caps[i];
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
break;
}
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
}
if (num_sequential > 0)
@@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* calculate_alloc_pointer() which takes extent buffer
* locks to avoid deadlock.
*/
+
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
if (new) {
cache->alloc_offset = 0;
goto out;
@@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
case BTRFS_BLOCK_GROUP_RAID1:
@@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1218,6 +1390,14 @@ out:
ret = -EIO;
}
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1229,6 +1409,12 @@ out:
if (!ret)
cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (ret) {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
- unusable = cache->alloc_offset - cache->used;
- free = cache->length - cache->alloc_offset;
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
-
- /* Should not have any excluded extents. Just in case, though */
- btrfs_free_excluded_extents(cache);
}
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
@@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!is_data_inode(&inode->vfs_inode))
return false;
+ /*
+ * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside own block-group from which only the
+ * relocation "process" can allocate and make sure only one process at a
+ * time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
@@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
struct blk_zone *zone)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 mapped_length = PAGE_SIZE;
unsigned int nofs_flag;
int nmirrors;
int i, ret;
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < PAGE_SIZE) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ btrfs_put_bioc(bioc);
return -EIO;
}
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
return -EINVAL;
nofs_flag = memalloc_nofs_save();
- nmirrors = (int)bbio->num_stripes;
+ nmirrors = (int)bioc->num_stripes;
for (i = 0; i < nmirrors; i++) {
- u64 physical = bbio->stripes[i].physical;
- struct btrfs_device *dev = bbio->stripes[i].dev;
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
/* Missing device */
if (!dev->bdev)
@@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device;
}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return true;
+
+ spin_lock(&block_group->lock);
+
+ if (block_group->zone_is_active) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
+
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(list_empty(&block_group->active_bg_list));
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return 0;
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if ((block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+
+ block_group->zone_is_active = 0;
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+ btrfs_dec_block_group_ro(block_group);
+
+ if (!ret) {
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+ }
+
+ return ret;
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+{
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_devices->fs_info))
+ return true;
+
+ /* Non-single profiles are not supported yet */
+ if (raid_index != BTRFS_RAID_SINGLE)
+ return false;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ if (logical + length < block_group->start + block_group->zone_capacity)
+ goto out;
+
+ spin_lock(&block_group->lock);
+
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+
+ block_group->zone_is_active = 0;
+ /* We should have consumed all the free space */
+ ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+ ASSERT(block_group->free_space_ctl->free_space == 0);
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (!device->zone_info->max_active_zones)
+ goto out;
+
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 4b299705bb12..e53ab7b96437 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -23,8 +23,11 @@ struct btrfs_zoned_device_info {
u64 zone_size;
u8 zone_size_shift;
u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
+ unsigned long *active_zones;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
@@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes);
@@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
return 0;
}
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
@@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device(
return ERR_PTR(-EOPNOTSUPP);
}
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 56dce9f00988..f06b68040352 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
+ if (out_page)
+ kunmap(out_page);
return ret;
}
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- page_in_index++;
+ kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
zero_fill_bio(cb->orig_bio);
done:
+ if (workspace->in_buf.src)
+ kunmap(pages_in[page_in_index]);
return ret;
}