summaryrefslogtreecommitdiff
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--fs/btrfs/volumes.c4290
1 files changed, 2089 insertions, 2201 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b07d382d53a8..ae1742a35e76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5,27 +5,19 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
-#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
-#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
+#include "extent-tree.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
-#include "async-thread.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
@@ -33,11 +25,31 @@
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+ bool use_rst;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -164,24 +176,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
}
const char *btrfs_bg_type_to_raid_name(u64 flags)
@@ -194,6 +194,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags)
return btrfs_raid_array[index].raid_name;
}
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
@@ -206,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
u64 flags = bg_flags;
u32 size_bp = size_buf;
- if (!flags) {
- strcpy(bp, "NONE");
+ if (!flags)
return;
- }
#define DESCRIBE_FLAG(flag, desc) \
do { \
@@ -250,13 +255,7 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map);
/*
* Device locking
@@ -366,16 +365,16 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
}
/*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
+ * fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
- const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
@@ -389,23 +388,24 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
INIT_LIST_HEAD(&fs_devs->seed_list);
- if (fsid)
- memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- if (metadata_fsid)
- memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
- else if (fsid)
+ if (fsid) {
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+ }
return fs_devs;
}
-void btrfs_free_device(struct btrfs_device *device)
+static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
- rcu_string_free(device->name);
- extent_io_tree_release(&device->alloc_state);
- bio_put(device->flush_bio);
+ /*
+ * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is
+ * reading the device name.
+ */
+ kfree(rcu_dereference_raw(device->name));
+ btrfs_extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -413,10 +413,12 @@ void btrfs_free_device(struct btrfs_device *device)
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
+
WARN_ON(fs_devices->opened);
+ WARN_ON(fs_devices->holding);
while (!list_empty(&fs_devices->devices)) {
- device = list_entry(fs_devices->devices.next,
- struct btrfs_device, dev_list);
+ device = list_first_entry(&fs_devices->devices,
+ struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
@@ -428,146 +430,110 @@ void __exit btrfs_cleanup_fs_uuids(void)
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
- fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, fs_list);
+ fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices,
+ fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
-static noinline struct btrfs_fs_devices *find_fsid(
- const u8 *fsid, const u8 *metadata_fsid)
+static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
+ const u8 *fsid, const u8 *metadata_fsid)
{
- struct btrfs_fs_devices *fs_devices;
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
+ return false;
- ASSERT(fsid);
+ if (!metadata_fsid)
+ return true;
- /* Handle non-split brain cases */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (metadata_fsid) {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
- && memcmp(metadata_fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- } else {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
- }
- return NULL;
+ if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
+ return false;
+
+ return true;
}
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
- struct btrfs_super_block *disk_super)
+static noinline struct btrfs_fs_devices *find_fsid(
+ const u8 *fsid, const u8 *metadata_fsid)
{
-
struct btrfs_fs_devices *fs_devices;
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(disk_super->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
- return fs_devices;
- }
- }
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
+ ASSERT(fsid);
+
+ /* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(fs_devices->metadata_uuid,
- fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
- memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
+ if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
return fs_devices;
- }
}
-
- return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+ return NULL;
}
-
static int
-btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
- int flush, struct block_device **bdev,
+btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
+ struct block_device *bdev;
int ret;
- *bdev = blkdev_get_by_path(device_path, flags, holder);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
- if (IS_ERR(*bdev)) {
- ret = PTR_ERR(*bdev);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
+ btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
+ device_path, flags, ret);
goto error;
}
+ bdev = file_bdev(*bdev_file);
if (flush)
- sync_blockdev(*bdev);
- ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
- if (ret) {
- blkdev_put(*bdev, flags);
- goto error;
+ sync_blockdev(bdev);
+ if (holder) {
+ ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ bdev_fput(*bdev_file);
+ goto error;
+ }
}
- invalidate_bdev(*bdev);
- *disk_super = btrfs_read_dev_super(*bdev);
+ invalidate_bdev(bdev);
+ *disk_super = btrfs_read_disk_super(bdev, 0, false);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- blkdev_put(*bdev, flags);
+ bdev_fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev = NULL;
+ *disk_super = NULL;
+ *bdev_file = NULL;
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
-{
- int found;
-
- rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
- rcu_read_unlock();
-
- return found == 0;
-}
-
/*
- * Search and remove all stale (devices which are not mounted) devices.
- * When both inputs are NULL, it will search and release all stale devices.
- * path: Optional. When provided will it release all unmounted devices
- * matching this path only.
- * skip_dev: Optional. Will skip this device when searching for the stale
- * devices.
- * Return: 0 for success or if @path is NULL.
- * -EBUSY if @path is a mounted device.
- * -ENOENT if @path does not match any device in the list.
+ * Search and remove all stale devices (which are not mounted). When both
+ * inputs are NULL, it will search and release all stale devices.
+ *
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
+ * devices.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
*/
-static int btrfs_free_stale_devices(const char *path,
- struct btrfs_device *skip_device)
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
- int ret = 0;
+ int ret;
+ bool freed = false;
lockdep_assert_held(&uuid_mutex);
- if (path)
- ret = -ENOENT;
-
+ /* Return good status if there is no instance of devt. */
+ ret = 0;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
@@ -575,13 +541,10 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
+ if (devt && devt != device->devt)
continue;
- if (path && !device_path_matched(path, device))
- continue;
- if (fs_devices->opened) {
- /* for an already deleted device return 0 */
- if (path && ret != 0)
+ if (fs_devices->opened || fs_devices->holding) {
+ if (devt)
ret = -EBUSY;
break;
}
@@ -591,7 +554,7 @@ static int btrfs_free_stale_devices(const char *path,
list_del(&device->dev_list);
btrfs_free_device(device);
- ret = 0;
+ freed = true;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -602,20 +565,91 @@ static int btrfs_free_stale_devices(const char *path,
}
}
+ /* If there is at least one freed device return 0. */
+ if (freed)
+ return 0;
+
return ret;
}
+static struct btrfs_fs_devices *find_fsid_by_device(
+ struct btrfs_super_block *disk_super,
+ dev_t devt, bool *same_fsid_diff_dev)
+{
+ struct btrfs_fs_devices *fsid_fs_devices;
+ struct btrfs_fs_devices *devt_fs_devices;
+ const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool found_by_devt = false;
+
+ /* Find the fs_device by the usual method, if found use it. */
+ fsid_fs_devices = find_fsid(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+ /* The temp_fsid feature is supported only with single device filesystem. */
+ if (btrfs_super_num_devices(disk_super) != 1)
+ return fsid_fs_devices;
+
+ /*
+ * A seed device is an integral component of the sprout device, which
+ * functions as a multi-device filesystem. So, temp-fsid feature is
+ * not supported.
+ */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+ return fsid_fs_devices;
+
+ /* Try to find a fs_devices by matching devt. */
+ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+ if (device->devt == devt) {
+ found_by_devt = true;
+ break;
+ }
+ }
+ if (found_by_devt)
+ break;
+ }
+
+ if (found_by_devt) {
+ /* Existing device. */
+ if (fsid_fs_devices == NULL) {
+ if (devt_fs_devices->opened == 0) {
+ /* Stale device. */
+ return NULL;
+ } else {
+ /* temp_fsid is mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* Regular or temp_fsid device mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* New device. */
+ if (fsid_fs_devices == NULL) {
+ return NULL;
+ } else {
+ /* sb::fsid is already used create a new temp_fsid. */
+ *same_fsid_diff_dev = true;
+ return NULL;
+ }
+ }
+
+ /* Not reached. */
+}
+
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
* fs_devices->device_list_mutex here.
*/
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *device, fmode_t flags,
+ struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct request_queue *q;
- struct block_device *bdev;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -625,8 +659,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!device->name)
return -EINVAL;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &disk_super);
+ ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -642,27 +676,39 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
- pr_err(
- "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ btrfs_err(NULL,
+ "invalid seeding and uuid-changed device detected");
goto error_free_page;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- q = bdev_get_queue(bdev);
- if (!blk_queue_nonrot(q))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- device->bdev = bdev;
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
+ fs_devices->discardable = true;
+
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- device->mode = flags;
+
+ if (device->devt != device->bdev->bd_dev) {
+ btrfs_warn(NULL,
+ "device %s maj:min changed from %d:%d to %d:%d",
+ rcu_dereference_raw(device->name), MAJOR(device->devt),
+ MINOR(device->devt), MAJOR(device->bdev->bd_dev),
+ MINOR(device->bdev->bd_dev));
+
+ device->devt = device->bdev->bd_dev;
+ }
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,95 +722,54 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, flags);
+ bdev_fput(bdev_file);
return -EINVAL;
}
-/*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
- struct btrfs_super_block *disk_super)
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
{
- struct btrfs_fs_devices *fs_devices;
-
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->fsid,
- BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
- return fs_devices;
- }
- }
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- return find_fsid(disk_super->fsid, NULL);
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}
-
-static struct btrfs_fs_devices *find_fsid_changed(
- struct btrfs_super_block *disk_super)
+static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but currently device didn't
- * observe it. Meaning our fsid will be different than theirs. We need
- * to handle two subcases :
- * 1 - The fs still continues to have different METADATA/FSID uuids.
- * 2 - The fs is switched back to its original FSID (METADATA/FSID
- * are equal).
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- /* Changed UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0)
- return fs_devices;
-
- /* Unchanged UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
+ struct path old = { .mnt = NULL, .dentry = NULL };
+ struct path new = { .mnt = NULL, .dentry = NULL };
+ char AUTO_KFREE(old_path);
+ bool is_same = false;
+ int ret;
- return NULL;
-}
+ if (!device->name)
+ goto out;
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
+ old_path = kzalloc(PATH_MAX, GFP_NOFS);
+ if (!old_path)
+ goto out;
- /*
- * Handle the case where the scanned device is part of an fs whose last
- * metadata UUID change reverted it to the original FSID. At the same
- * time * fs_devices was first created by another constitutent device
- * which didn't fully observe the operation. This results in an
- * btrfs_fs_devices created with metadata/fsid different AND
- * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
- * fs_devices equal to the FSID of the disk.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- fs_devices->fsid_change)
- return fs_devices;
- }
+ rcu_read_lock();
+ ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out;
- return NULL;
+ ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
+ if (ret)
+ goto out;
+ ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
+ if (ret)
+ goto out;
+ if (path_equal(&old, &new))
+ is_same = true;
+out:
+ path_put(&old);
+ path_put(&new);
+ return is_same;
}
+
/*
* Add new device to list of registered devices
*
@@ -778,39 +783,47 @@ static noinline struct btrfs_device *device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
- struct rcu_string *name;
+ const char *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int ret;
+ bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
- BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
- if (fsid_change_in_progress) {
- if (!has_metadata_uuid)
- fs_devices = find_fsid_inprogress(disk_super);
- else
- fs_devices = find_fsid_changed(disk_super);
- } else if (has_metadata_uuid) {
- fs_devices = find_fsid_with_metadata_uuid(disk_super);
- } else {
- fs_devices = find_fsid_reverted_metadata(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+ path);
+ return ERR_PTR(-EAGAIN);
}
+ ret = lookup_bdev(path, &path_devt);
+ if (ret) {
+ btrfs_err(NULL, "failed to lookup block device for path %s: %d",
+ path, ret);
+ return ERR_PTR(ret);
+ }
- if (!fs_devices) {
- if (has_metadata_uuid)
- fs_devices = alloc_fs_devices(disk_super->fsid,
- disk_super->metadata_uuid);
- else
- fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+ fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
+ if (!fs_devices) {
+ fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- fs_devices->fsid_change = fsid_change_in_progress;
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (same_fsid_diff_dev) {
+ generate_random_uuid(fs_devices->fsid);
+ fs_devices->temp_fsid = true;
+ btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid);
+ }
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
@@ -825,49 +838,38 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, &args);
- /*
- * If this disk has been pulled into an fs devices created by
- * a device which had the CHANGING_FSID_V2 flag then replace the
- * metadata_uuid/fsid values of the fs_devices.
- */
- if (fs_devices->fsid_change &&
- found_transid > fs_devices->latest_generation) {
+ if (found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
-
- if (has_metadata_uuid)
- memcpy(fs_devices->metadata_uuid,
- disk_super->metadata_uuid,
- BTRFS_FSID_SIZE);
- else
- memcpy(fs_devices->metadata_uuid,
- disk_super->fsid, BTRFS_FSID_SIZE);
-
- fs_devices->fsid_change = false;
+ memcpy(fs_devices->metadata_uuid,
+ btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
}
}
if (!device) {
+ unsigned int nofs_flag;
+
if (fs_devices->opened) {
+ btrfs_err(NULL,
+"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid, current->comm,
+ task_pid_nr(current));
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid,
- disk_super->dev_item.uuid);
+ disk_super->dev_item.uuid, path);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device)) {
mutex_unlock(&fs_devices->device_list_mutex);
/* we can safely leave the fs_devices entry around */
return device;
}
- name = rcu_string_strdup(path, GFP_NOFS);
- if (!name) {
- btrfs_free_device(device);
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(-ENOMEM);
- }
- rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
@@ -877,16 +879,20 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (disk_super->label[0])
pr_info(
- "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->label, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
else
pr_info(
- "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->fsid, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
- } else if (!device->name || strcmp(device->name->str, path)) {
+ } else if (!device->name || !is_same_device(device, path)) {
+ const char *old_name;
+
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -922,56 +928,54 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* generation are equal.
*/
mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_err(NULL,
+"device %s already registered with a higher generation, found %llu expect %llu",
+ path, found_transid, device->generation);
return ERR_PTR(-EEXIST);
}
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
- int error;
- dev_t path_dev;
-
- error = lookup_bdev(path, &path_dev);
- if (error) {
+ if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(error);
- }
-
- if (device->bdev->bd_dev != path_dev) {
- mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
- btrfs_warn_in_rcu(NULL,
+ btrfs_warn(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
- devid, rcu_str_deref(device->name),
+ devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
- name = rcu_string_strdup(path, GFP_NOFS);
+ name = kstrdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
- rcu_string_free(device->name);
+ rcu_read_lock();
+ old_name = rcu_dereference(device->name);
+ rcu_read_unlock();
rcu_assign_pointer(device->name, name);
+ kfree_rcu_mightsleep(old_name);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
+ device->devt = path_devt;
}
/*
@@ -1001,35 +1005,39 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
lockdep_assert_held(&uuid_mutex);
- fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
- struct rcu_string *name;
+ const char *dev_path = NULL;
+
+ /*
+ * This is ok to do without RCU read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name)
+ dev_path = rcu_dereference_raw(orig_dev->name);
device = btrfs_alloc_device(NULL, &orig_dev->devid,
- orig_dev->uuid);
+ orig_dev->uuid, dev_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
- /*
- * This is ok to do without rcu read locked because we hold the
- * uuid mutex so nothing we touch in here is going to disappear.
- */
- if (orig_dev->name) {
- name = rcu_string_strdup(orig_dev->name->str,
- GFP_KERNEL);
- if (!name) {
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
- rcu_assign_pointer(device->name, name);
+ device->zone_info = zone_info;
}
list_add(&device->dev_list, &fs_devices->devices);
@@ -1068,9 +1076,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev) {
- blkdev_put(device->bdev, device->mode);
+ if (device->bdev_file) {
+ bdev_fput(device->bdev_file);
device->bdev = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1115,7 +1124,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- blkdev_put(device->bdev, device->mode);
+ bdev_fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1140,13 +1149,14 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
- extent_io_tree_release(&device->alloc_state);
+ btrfs_extent_io_tree_release(&device->alloc_state);
/*
* Reset the flush error record. We might have a transient flush error
@@ -1162,10 +1172,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->last_flush_error = 0;
/* Verify the device is back in a pristine state */
- ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
- ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- ASSERT(list_empty(&device->dev_alloc_list));
- ASSERT(list_empty(&device->post_commit_list));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ WARN_ON(!list_empty(&device->dev_alloc_list));
+ WARN_ON(!list_empty(&device->post_commit_list));
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -1194,9 +1204,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
- if (!fs_devices->opened)
+ if (!fs_devices->opened && !fs_devices->holding) {
list_splice_init(&fs_devices->seed_list, &list);
+ /*
+ * If the struct btrfs_fs_devices is not assembled with any
+ * other device, it can be re-initialized during the next mount
+ * without the needing device-scan step. Therefore, it can be
+ * fully freed.
+ */
+ if (fs_devices->num_devices == 1) {
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
+ }
+
+
list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
list_del(&fs_devices->seed_list);
@@ -1206,36 +1229,58 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
- fmode_t flags, void *holder)
+ blk_mode_t flags, void *holder)
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct btrfs_device *tmp_device;
-
- flags |= FMODE_EXCL;
+ s64 __maybe_unused value = 0;
+ int ret = 0;
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
dev_list) {
- int ret;
+ int ret2;
- ret = btrfs_open_one_device(fs_devices, device, flags, holder);
- if (ret == 0 &&
+ ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret2 == 0 &&
(!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
- } else if (ret == -ENODATA) {
+ } else if (ret2 == -ENODATA) {
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
}
+ if (ret == 0 && ret2 != 0)
+ ret = ret2;
}
- if (fs_devices->open_devices == 0)
+
+ if (fs_devices->open_devices == 0) {
+ if (ret)
+ return ret;
return -EINVAL;
+ }
fs_devices->opened = 1;
fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+ fs_devices->read_devid = latest_dev->devid;
+ fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(),
+ &value);
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->collect_fs_stats = true;
+
+ if (value) {
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->rr_min_contig_read = value;
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
+ fs_devices->read_devid = value;
+ }
+#else
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#endif
return 0;
}
@@ -1256,7 +1301,7 @@ static int devid_cmp(void *priv, const struct list_head *a,
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
- fmode_t flags, void *holder)
+ blk_mode_t flags, void *holder)
{
int ret;
@@ -1287,113 +1332,172 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
put_page(page);
}
-static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr, u64 bytenr_orig)
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
{
- struct btrfs_super_block *disk_super;
+ struct btrfs_super_block *super;
struct page *page;
- void *p;
- pgoff_t index;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_mapping;
+ int ret;
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ERR_PTR(ret);
+ }
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
- p = page_address(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + offset_in_page(bytenr);
-
- if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(p);
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC ||
+ btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
- if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
+ /*
+ * Make sure the last byte of label is properly NUL terminated. We use
+ * '%s' to print the label, if not properly NUL terminated we can access
+ * beyond the label.
+ */
+ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
+ super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return disk_super;
+ return super;
}
-int btrfs_forget_devices(const char *path)
+int btrfs_forget_devices(dev_t devt)
{
int ret;
mutex_lock(&uuid_mutex);
- ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ ret = btrfs_free_stale_devices(devt, NULL);
mutex_unlock(&uuid_mutex);
return ret;
}
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+ const char *path, dev_t devt,
+ bool mount_arg_dev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Do not skip device registration for mounted devices with matching
+ * maj:min but different paths. Booting without initrd relies on
+ * /dev/root initially, later replaced with the actual root device.
+ * A successful scan ensures grub2-probe selects the correct device.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ if (!fs_devices->opened) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ continue;
+ }
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev && (device->bdev->bd_dev == devt) &&
+ strcmp(rcu_dereference_raw(device->name), path) != 0) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Do not skip registration. */
+ return false;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+ }
+
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+ return true;
+
+ return false;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * With @mount_arg_dev it's a scan during mount time that will always register
+ * the device or return an error. Multi-device and seeding devices are registered
+ * in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
- void *holder)
+struct btrfs_device *btrfs_scan_one_device(const char *path,
+ bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct block_device *bdev;
- u64 bytenr, bytenr_orig;
- int ret;
+ struct file *bdev_file;
+ dev_t devt;
lockdep_assert_held(&uuid_mutex);
/*
- * we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ * Avoid an exclusive open here, as the systemd-udev may initiate the
+ * device scan which may race with the user's mount or mkfs command,
+ * resulting in failure.
+ * Since the device scan is solely for reading purposes, there is no
+ * need for an exclusive open. Additionally, the devices are read again
+ * during the mount process. It is ok to get some inconsistent
+ * values temporarily, as the device paths of the fsid are the only
+ * required information for assembling the volume.
*/
- flags |= FMODE_EXCL;
+ bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
- bdev = blkdev_get_by_path(path, flags, holder);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
-
- bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
- if (ret) {
- device = ERR_PTR(ret);
- goto error_bdev_put;
- }
-
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
- device = device_list_add(path, disk_super, &new_device_added);
- if (!IS_ERR(device)) {
- if (new_device_added)
- btrfs_free_stale_devices(path, device);
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+ btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
+ path, MAJOR(devt), MINOR(devt));
+
+ btrfs_free_stale_devices(devt, NULL);
+
+ device = NULL;
+ goto free_disk_super;
}
+ device = device_list_add(path, disk_super, &new_device_added);
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
+
+free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- blkdev_put(bdev, flags);
+ bdev_fput(bdev_file);
return device;
}
@@ -1409,13 +1513,13 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (!find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
- CHUNK_ALLOCATED, NULL)) {
+ if (btrfs_find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
- physical_end - physical_start)) {
+ physical_end + 1 - physical_start)) {
*start = physical_end + 1;
return true;
}
@@ -1423,25 +1527,21 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
return false;
}
-static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
- /*
- * We don't want to overwrite the superblock on the drive nor
- * any area used by the boot loader (grub for example), so we
- * make sure to start at an offset of at least 1MB.
- */
- return max_t(u64, start, SZ_1M);
+ return BTRFS_DEVICE_RANGE_RESERVED;
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like regular
* allocator, because we anyway use/reserve the first two zones
* for superblock logging.
*/
- return ALIGN(start, device->zone_info->zone_size);
- default:
- BUG();
+ return 0;
}
}
@@ -1454,7 +1554,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
int ret;
bool changed = false;
- ASSERT(IS_ALIGNED(*hole_start, zone_size));
+ ASSERT(IS_ALIGNED(*hole_start, zone_size),
+ "hole_start=%llu zone_size=%llu", *hole_start, zone_size);
while (*hole_size > 0) {
pos = btrfs_find_allocatable_zones(device, *hole_start,
@@ -1489,8 +1590,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
return changed;
}
-/**
- * dev_extent_hole_check - check if specified hole is suitable for allocation
+/*
+ * Check if specified hole is suitable for allocation.
+ *
* @device: the device which we have the hole
* @hole_start: starting position of the hole
* @hole_size: the size of the hole
@@ -1519,6 +1621,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
/* No extra check */
break;
@@ -1533,8 +1638,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
continue;
}
break;
- default:
- BUG();
}
break;
@@ -1544,7 +1647,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
/*
- * find_free_dev_extent_start - find free space in the specified device
+ * Find free space in the specified device.
+ *
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @search_start: the position from which to begin the search
@@ -1552,9 +1656,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* @len: the size of the free space. that we find, or the size
* of the max free space if we don't find suitable free space
*
- * this uses a pretty simple search, the expectation is that it is
- * called very infrequently and that a given device has a small number
- * of extents
+ * This does a pretty simple search, the expectation is that it is called very
+ * infrequently and that a given device has a small number of extents.
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
@@ -1570,36 +1673,35 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* correct usable device space, as device extent freed in current transaction
* is not reported as available.
*/
-static int find_free_dev_extent_start(struct btrfs_device *device,
- u64 num_bytes, u64 search_start, u64 *start,
- u64 *len)
+static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
+ u64 search_start;
u64 hole_size;
u64 max_hole_start;
- u64 max_hole_size;
+ u64 max_hole_size = 0;
u64 extent_end;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
- search_start = dev_extent_search_start(device, search_start);
+ search_start = dev_extent_search_start(device);
+ max_hole_start = search_start;
WARN_ON(device->zone_info &&
!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- max_hole_start = search_start;
- max_hole_size = 0;
-
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
again:
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -1608,18 +1710,18 @@ again:
}
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = device->devid;
- key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = search_start;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
- while (1) {
+ while (search_start < search_end) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
@@ -1642,6 +1744,9 @@ again:
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
+ if (key.offset > search_end)
+ break;
+
if (key.offset > search_start) {
hole_size = key.offset - search_start;
dev_extent_hole_check(device, &search_start, &hole_size,
@@ -1702,21 +1807,16 @@ next:
else
ret = 0;
+ ASSERT(max_hole_start + max_hole_size <= search_end,
+ "max_hole_start=%llu max_hole_size=%llu search_end=%llu",
+ max_hole_start, max_hole_size, search_end);
out:
- btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
-{
- /* FIXME use last free of some kind */
- return find_free_dev_extent_start(device, num_bytes, 0, start, len);
-}
-
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
@@ -1724,7 +1824,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
@@ -1735,15 +1835,15 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = device->devid;
- key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1758,7 +1858,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- goto out;
+ return ret;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
@@ -1766,26 +1866,23 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
-out:
- btrfs_free_path(path);
return ret;
}
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -1796,7 +1893,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
@@ -1808,13 +1905,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
ret = btrfs_previous_item(fs_info->chunk_root, path,
@@ -1827,10 +1923,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -1841,7 +1934,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -1860,7 +1953,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
&key, sizeof(*dev_item));
btrfs_trans_release_chunk_metadata(trans);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -1885,12 +1978,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(leaf);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -1902,35 +1991,25 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- struct timespec64 now;
- int ret;
- ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
- if (ret)
- return;
-
- now = current_time(d_inode(path.dentry));
- inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
- path_put(&path);
+ if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) {
+ vfs_utimes(&path, NULL);
+ path_put(&path);
+ }
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1938,25 +2017,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
btrfs_trans_release_chunk_metadata(trans);
- if (ret) {
- if (ret > 0)
- ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
-
- ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
+ if (ret > 0)
+ return -ENOENT;
+ if (ret < 0)
+ return ret;
-out:
- btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
/*
@@ -2039,7 +2105,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- ASSERT(num_devices > 1);
+ ASSERT(num_devices > 1, "num_devices=%llu", num_devices);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
@@ -2047,61 +2113,66 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev, int copy_num)
{
struct btrfs_super_block *disk_super;
- int copy_num;
+ const size_t len = sizeof(disk_super->magic);
+ const u64 bytenr = btrfs_sb_offset(copy_num);
+ int ret;
- if (!bdev)
+ disk_super = btrfs_read_disk_super(bdev, copy_num, false);
+ if (IS_ERR(disk_super))
return;
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
- struct page *page;
- int ret;
-
- disk_super = btrfs_read_dev_one_super(bdev, copy_num);
- if (IS_ERR(disk_super))
- continue;
+ memset(&disk_super->magic, 0, len);
+ folio_mark_dirty(virt_to_folio(disk_super));
+ btrfs_release_disk_super(disk_super);
- if (bdev_is_zoned(bdev)) {
- btrfs_reset_sb_log_zones(bdev, copy_num);
- continue;
- }
+ ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
+ if (ret)
+ btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
+ copy_num, ret);
+}
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
+{
+ int copy_num;
+ struct block_device *bdev = device->bdev;
- page = virt_to_page(disk_super);
- set_page_dirty(page);
- lock_page(page);
- /* write_on_page() unlocks the page */
- ret = write_one_page(page);
- if (ret)
- btrfs_warn(fs_info,
- "error clearing superblock number %d (%d)",
- copy_num, ret);
- btrfs_release_disk_super(disk_super);
+ if (!bdev)
+ return;
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+ if (bdev_is_zoned(bdev))
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ else
+ btrfs_scratch_superblock(fs_info, bdev, copy_num);
}
/* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
+ update_dev_time(rcu_dereference_raw(device->name));
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, fmode_t *mode)
+ struct file **bdev_file)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* The device list in fs_devices is accessed without locks (neither
* uuid_mutex nor device_list_mutex) as it won't change on a mounted
@@ -2111,7 +2182,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
+ return ret;
device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) {
@@ -2119,27 +2190,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = -ENOENT;
- goto out;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
- rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ btrfs_dev_name(device), device->devid);
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2152,14 +2218,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (unlikely(ret)) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2194,7 +2268,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2210,21 +2284,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and blkdev_put() will pull in the ->open_mutex on the
- * block device and it's dependencies. Instead just flush the device
- * and let the caller do the final blkdev_put.
+ * write lock, and bdev_fput() on the block device will pull in the
+ * ->open_mutex on the block device and it's dependencies. Instead
+ * just flush the device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
- btrfs_scratch_superblocks(fs_info, device->bdev,
- device->name->str);
+ btrfs_scratch_superblocks(fs_info, device);
if (device->bdev) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
}
- *bdev = device->bdev;
- *mode = device->mode;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2237,12 +2309,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
*/
if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- ASSERT(cur_devices->opened == 1);
+ ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened);
cur_devices->opened--;
free_fs_devices(cur_devices);
}
-out:
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
@@ -2253,7 +2326,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
@@ -2329,16 +2402,15 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
- tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
btrfs_free_device(tgtdev);
}
-/**
- * Populate args from device at path
+/*
+ * Populate args from device at path.
*
* @fs_info: the filesystem
* @args: the args to populate
@@ -2360,7 +2432,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct block_device *bdev;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2377,10 +2449,13 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
- ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
- &bdev, &disk_super);
- if (ret)
+ ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
+ &bdev_file, &disk_super);
+ if (ret) {
+ btrfs_put_dev_args_from_path(args);
return ret;
+ }
+
args->devid = btrfs_stack_device_id(&disk_super->dev_item);
memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -2388,7 +2463,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, FMODE_READ);
+ bdev_fput(bdev_file);
return 0;
}
@@ -2445,7 +2520,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
- seed_devices = alloc_fs_devices(NULL, NULL);
+ seed_devices = alloc_fs_devices(NULL);
if (IS_ERR(seed_devices))
return seed_devices;
@@ -2533,7 +2608,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
@@ -2547,15 +2622,15 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = 0;
while (1) {
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
next_slot:
@@ -2564,7 +2639,7 @@ next_slot:
if (ret > 0)
break;
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -2588,32 +2663,25 @@ next_slot:
device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
- if (device->fs_devices->seeding) {
+ if (device->fs_devices->seeding)
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(leaf);
- }
path->slots[0]++;
goto next_slot;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
- struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct block_device *bdev;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
- struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_fs_devices *seed_devices;
+ struct btrfs_fs_devices *seed_devices = NULL;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
int ret = 0;
@@ -2623,12 +2691,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
- fs_info->bdev_holder);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->sb, &fs_holder_ops);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
+ ret = -EINVAL;
+ goto error;
+ }
- if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
ret = -EINVAL;
goto error;
}
@@ -2640,11 +2713,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2652,22 +2725,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_read_unlock();
- device = btrfs_alloc_device(fs_info, NULL, NULL);
+ device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- ret = -ENOMEM;
- goto error_free_device;
- }
- rcu_assign_pointer(device->name, name);
-
device->fs_info = fs_info;
- device->bdev = bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
@@ -2679,25 +2749,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_free_zone;
}
- q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes =
- round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- btrfs_clear_sb_rdonly(sb);
-
/* GFP_KERNEL allocation must not be under device_list_mutex */
seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices)) {
@@ -2727,7 +2793,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(q))
+ if (!bdev_nonrot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2756,21 +2822,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
@@ -2814,7 +2880,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
- btrfs_forget_devices(device_path);
+ btrfs_forget_devices(device->devt);
/* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
@@ -2840,8 +2906,6 @@ error_sysfs:
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
- if (seeding_dev)
- btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
error_free_zone:
@@ -2849,7 +2913,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- blkdev_put(bdev, FMODE_EXCL);
+ bdev_fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2861,7 +2925,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
@@ -2877,12 +2941,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -2896,10 +2958,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(leaf);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2930,6 +2988,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
@@ -2951,7 +3010,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -2959,25 +3018,26 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
- else if (ret > 0) { /* Logic error or corruption */
- btrfs_handle_fs_error(fs_info, -ENOENT,
- "Failed lookup while freeing chunk.");
- ret = -ENOENT;
- goto out;
+ return ret;
+ if (unlikely(ret > 0)) {
+ /* Logic error or corruption */
+ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
+ chunk_offset);
+ btrfs_abort_transaction(trans, -ENOENT);
+ return -EUCLEAN;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0)
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to delete chunk item.");
-out:
- btrfs_free_path(path);
+ if (unlikely(ret < 0)) {
+ btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
return ret;
}
@@ -3027,44 +3087,118 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with underflow by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
/*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ if (unlikely(!map)) {
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
- free_extent_map(em);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3089,23 +3223,22 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
- ASSERT(0);
- return PTR_ERR(em);
+ DEBUG_WARN("errr %ld reading chunk map at offset %llu",
+ PTR_ERR(map), chunk_offset);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3123,7 +3256,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
- if (ret) {
+ if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3135,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3184,8 +3323,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, sys_flags);
+ if (unlikely(!space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3193,26 +3340,26 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3227,8 +3374,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
- if (ret) {
+ ret = btrfs_remove_block_group(trans, map);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3239,11 +3386,12 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
@@ -3251,6 +3399,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u64 length;
int ret;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -3267,10 +3421,17 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
btrfs_scrub_continue(fs_info);
- if (ret)
+ if (ret) {
+ /*
+ * If we had a transaction abort, stop all running scrubs.
+ * See transaction.c:cleanup_transaction() why we do it here.
+ */
+ if (BTRFS_FS_ERROR(fs_info))
+ btrfs_scrub_cancel(fs_info);
return ret;
+ }
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
@@ -3313,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
@@ -3329,24 +3490,34 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
- goto error;
+ return ret;
+ }
+ if (unlikely(ret == 0)) {
+ /*
+ * On the first search we would find chunk tree with
+ * offset -1, which is not possible. On subsequent
+ * loops this would find an existing item on an invalid
+ * offset (one less than the previous one, wrong
+ * alignment and size).
+ */
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ return -EUCLEAN;
}
- BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
- goto error;
+ return ret;
if (ret > 0)
break;
@@ -3359,7 +3530,8 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset,
+ true);
if (ret == -ENOSPC)
failed++;
else
@@ -3379,8 +3551,6 @@ again:
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
-error:
- btrfs_free_path(path);
return ret;
}
@@ -3426,6 +3596,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3468,10 +3676,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
-
btrfs_set_balance_flags(leaf, item, bctl->flags);
-
- btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -3570,7 +3775,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
- BUG_ON(!fs_info->balance_ctl);
+ ASSERT(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
@@ -3586,26 +3791,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
-static int chunk_profiles_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3613,30 +3817,28 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->length,
- bargs->usage_min);
+ user_thresh_min = mult_perc(cache->length, bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->length,
- bargs->usage_max);
+ user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
- u64 chunk_offset, struct btrfs_balance_args *bargs)
+static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3646,18 +3848,17 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
else if (bargs->usage > 100)
user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->length, bargs->usage);
+ user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_devid_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3666,10 +3867,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
@@ -3682,9 +3883,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
}
/* [pstart, pend) */
-static int chunk_drange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3695,7 +3895,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
- return 0;
+ return false;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
@@ -3711,56 +3911,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* [vstart, vend) */
-static int chunk_vrange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_stripes_range_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_soft_convert_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
- return 0;
+ return false;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
- return 1;
+ return true;
- return 0;
+ return false;
}
-static int should_balance_chunk(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 chunk_offset)
+static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -3770,7 +3967,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
- return 0;
+ return false;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
@@ -3783,46 +3980,46 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/*
@@ -3830,7 +4027,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
- return 0;
+ return false;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
@@ -3840,12 +4037,12 @@ static int should_balance_chunk(struct extent_buffer *leaf,
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
- return 0;
+ return false;
else
bargs->limit_max--;
}
- return 1;
+ return true;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
@@ -3854,7 +4051,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_root *chunk_root = fs_info->chunk_root;
u64 chunk_type;
struct btrfs_chunk *chunk;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -3892,8 +4089,8 @@ again:
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
@@ -3997,7 +4194,7 @@ again:
}
}
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
@@ -4025,7 +4222,6 @@ loop:
goto again;
}
error:
- btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
@@ -4036,12 +4232,13 @@ error:
return ret;
}
-/**
- * alloc_profile_is_valid - see if a given profile is valid and reduced
- * @flags: profile to validate
- * @extended: if true @flags is treated as an extended profile
+/*
+ * See if a given profile is valid and reduced.
+ *
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
*/
-static int alloc_profile_is_valid(u64 flags, int extended)
+static int alloc_profile_is_valid(u64 flags, bool extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -4059,14 +4256,6 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return has_single_bit_set(flags);
}
-static inline int balance_need_close(struct btrfs_fs_info *fs_info)
-{
- /* cancel requested || normal exit path */
- return atomic_read(&fs_info->balance_cancel_req) ||
- (atomic_read(&fs_info->balance_pause_req) == 0 &&
- atomic_read(&fs_info->balance_cancel_req) == 0);
-}
-
/*
* Validate target profile against allowed profiles and return true if it's OK.
* Otherwise print the error message and return false.
@@ -4078,13 +4267,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
- if (fs_info->sectorsize < PAGE_SIZE &&
- bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sectorsize %u with page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- return false;
- }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -4197,7 +4379,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
u32 size_buf = 1024;
char tmp_buf[192] = {'\0'};
- char *buf;
+ char AUTO_KFREE(buf);
char *bp;
u32 size_bp = size_buf;
int ret;
@@ -4245,12 +4427,10 @@ out_overflow:
btrfs_info(fs_info, "balance: %s %s",
(bctl->flags & BTRFS_BALANCE_RESUME) ?
"resume" : "start", buf);
-
- kfree(buf);
}
/*
- * Should be called with balance mutexe held
+ * Should be called with balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
@@ -4263,6 +4443,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
u64 num_devices;
unsigned seq;
bool reducing_redundancy;
+ bool paused = false;
int i;
if (btrfs_fs_closing(fs_info) ||
@@ -4393,6 +4574,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ paused = true;
}
/*
* Balance can be canceled by:
@@ -4421,8 +4603,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_update_ioctl_balance_args(fs_info, bargs);
}
- if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- balance_need_close(fs_info)) {
+ /* We didn't pause, we can clean everything up. */
+ if (!paused) {
reset_balance_state(fs_info);
btrfs_exclop_finish(fs_info);
}
@@ -4445,6 +4627,8 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ guard(super_write)(fs_info->sb);
+
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
@@ -4470,7 +4654,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED,
+ "exclusive_operation=%d", fs_info->exclusive_operation);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
/*
@@ -4491,7 +4676,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4506,17 +4691,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) { /* ret = -ENOENT; */
- ret = 0;
- goto out;
+ return 0;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
- if (!bctl) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!bctl)
+ return -ENOMEM;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
@@ -4553,8 +4735,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -4630,190 +4810,12 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
}
}
- BUG_ON(fs_info->balance_ctl ||
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
}
-int btrfs_uuid_scan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = data;
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_key key;
- struct btrfs_path *path = NULL;
- int ret = 0;
- struct extent_buffer *eb;
- int slot;
- struct btrfs_root_item root_item;
- u32 item_size;
- struct btrfs_trans_handle *trans = NULL;
- bool closing = false;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- key.objectid = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = 0;
-
- while (1) {
- if (btrfs_fs_closing(fs_info)) {
- closing = true;
- break;
- }
- ret = btrfs_search_forward(root, &key, path,
- BTRFS_OLDEST_GENERATION);
- if (ret) {
- if (ret > 0)
- ret = 0;
- break;
- }
-
- if (key.type != BTRFS_ROOT_ITEM_KEY ||
- (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
- key.objectid != BTRFS_FS_TREE_OBJECTID) ||
- key.objectid > BTRFS_LAST_FREE_OBJECTID)
- goto skip;
-
- eb = path->nodes[0];
- slot = path->slots[0];
- item_size = btrfs_item_size(eb, slot);
- if (item_size < sizeof(root_item))
- goto skip;
-
- read_extent_buffer(eb, &root_item,
- btrfs_item_ptr_offset(eb, slot),
- (int)sizeof(root_item));
- if (btrfs_root_refs(&root_item) == 0)
- goto skip;
-
- if (!btrfs_is_empty_uuid(root_item.uuid) ||
- !btrfs_is_empty_uuid(root_item.received_uuid)) {
- if (trans)
- goto update_tree;
-
- btrfs_release_path(path);
- /*
- * 1 - subvol uuid item
- * 1 - received_subvol uuid item
- */
- trans = btrfs_start_transaction(fs_info->uuid_root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- continue;
- } else {
- goto skip;
- }
-update_tree:
- btrfs_release_path(path);
- if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
- if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans,
- root_item.received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
-skip:
- btrfs_release_path(path);
- if (trans) {
- ret = btrfs_end_transaction(trans);
- trans = NULL;
- if (ret)
- break;
- }
-
- if (key.offset < (u64)-1) {
- key.offset++;
- } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- } else if (key.objectid < (u64)-1) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.objectid++;
- } else {
- break;
- }
- cond_resched();
- }
-
-out:
- btrfs_free_path(path);
- if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans);
- if (ret)
- btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else if (!closing)
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
- up(&fs_info->uuid_tree_rescan_sem);
- return 0;
-}
-
-int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *uuid_root;
- struct task_struct *task;
- int ret;
-
- /*
- * 1 - root node
- * 1 - root item
- */
- trans = btrfs_start_transaction(tree_root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
- if (IS_ERR(uuid_root)) {
- ret = PTR_ERR(uuid_root);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- fs_info->uuid_root = uuid_root;
-
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_scan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -4839,6 +4841,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
+ u64 free_diff = 0;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4864,7 +4867,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
- atomic64_sub(diff, &fs_info->free_chunk_space);
+
+ /*
+ * The new free_chunk_space is new_size - used, so we have to
+ * subtract the delta of the old free_chunk_space which included
+ * old_size - used. If used > new_size then just subtract this
+ * entire device's free space.
+ */
+ if (device->bytes_used < new_size)
+ free_diff = (old_size - device->bytes_used) -
+ (new_size - device->bytes_used);
+ else
+ free_diff = old_size - device->bytes_used;
+ atomic64_sub(free_diff, &fs_info->free_chunk_space);
}
/*
@@ -4884,8 +4899,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
again:
key.objectid = device->devid;
- key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = (u64)-1;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -4939,7 +4954,7 @@ again:
goto done;
}
- ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
@@ -4971,8 +4986,8 @@ again:
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
- clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
- CHUNK_STATE_MASK);
+ btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK, NULL);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
@@ -4988,7 +5003,7 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_trans_release_chunk_metadata(trans);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
@@ -4999,9 +5014,10 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes += diff;
- atomic64_add(diff, &fs_info->free_chunk_space);
+ atomic64_add(free_diff, &fs_info->free_chunk_space);
+ }
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -5098,37 +5114,29 @@ struct alloc_chunk_ctl {
u64 stripe_size;
u64 chunk_size;
int ndevs;
+ /* Space_info the block group is going to belong. */
+ struct btrfs_space_info *space_info;
};
static void init_alloc_chunk_ctl_policy_regular(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
- u64 type = ctl->type;
+ struct btrfs_space_info *space_info;
- if (type & BTRFS_BLOCK_GROUP_DATA) {
- ctl->max_stripe_size = SZ_1G;
- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* For larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- ctl->max_stripe_size = SZ_1G;
- else
- ctl->max_stripe_size = SZ_256M;
- ctl->max_chunk_size = ctl->max_stripe_size;
- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ctl->max_stripe_size = SZ_32M;
- ctl->max_chunk_size = 2 * ctl->max_stripe_size;
- ctl->devs_max = min_t(int, ctl->devs_max,
- BTRFS_MAX_DEVS_SYS_CHUNK);
- } else {
- BUG();
- }
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
+
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
+
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
- ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
ctl->max_chunk_size);
- ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+ ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
}
static void init_alloc_chunk_ctl_policy_zoned(
@@ -5157,7 +5165,7 @@ static void init_alloc_chunk_ctl_policy_zoned(
}
/* We don't want a chunk larger than 10% of writable space */
- limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
zone_size),
min_chunk_size);
ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
@@ -5181,14 +5189,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = 0;
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
case BTRFS_CHUNK_ALLOC_ZONED:
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
break;
- default:
- BUG();
}
}
@@ -5306,6 +5315,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
ctl->stripe_size);
}
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
/* Align to BTRFS_STRIPE_LEN */
ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5324,20 +5336,24 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
* It should hold because:
* dev_extent_min == dev_extent_want == zone_size * dev_stripes
*/
- ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min,
+ "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs,
+ devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min);
ctl->stripe_size = zone_size;
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size,
+ "stripe_size=%llu data_stripes=%d max_chunk_size=%llu",
+ ctl->stripe_size, data_stripes, ctl->max_chunk_size);
}
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5370,79 +5386,151 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
case BTRFS_CHUNK_ALLOC_ZONED:
return decide_stripe_size_zoned(ctl, devices_info);
- default:
- BUG();
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ btrfs_set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+static int btrfs_chunk_map_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_chunk_map *new_map =
+ rb_entry(new, struct btrfs_chunk_map, rb_node);
+ const struct btrfs_chunk_map *exist_map =
+ rb_entry(exist, struct btrfs_chunk_map, rb_node);
+
+ if (new_map->start == exist_map->start)
+ return 0;
+ if (new_map->start < exist_map->start)
+ return -1;
+ return 1;
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node *exist;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree,
+ btrfs_chunk_map_cmp);
+
+ if (exist) {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
- int i;
- int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
- for (i = 0; i < ctl->ndevs; ++i) {
- for (j = 0; j < ctl->dev_stripes; ++j) {
+ for (int i = 0; i < ctl->ndevs; i++) {
+ for (int j = 0; j < ctl->dev_stripes; j++) {
int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
j * ctl->stripe_size;
}
}
- map->stripe_len = BTRFS_STRIPE_LEN;
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
- block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ block_group = btrfs_make_block_group(trans, ctl->space_info, type, start,
+ ctl->chunk_size);
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5455,39 +5543,26 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type)
+ struct btrfs_space_info *space_info,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device_info *devices_info = NULL;
+ struct btrfs_device_info AUTO_KFREE(devices_info);
struct alloc_chunk_ctl ctl;
- struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
- ASSERT(0);
+ DEBUG_WARN("invalid alloc profile for type %llu", type);
return ERR_PTR(-EINVAL);
}
@@ -5499,12 +5574,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
ctl.type = type;
+ ctl.space_info = space_info;
init_alloc_chunk_ctl(fs_devices, &ctl);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
@@ -5513,22 +5589,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
+ if (ret < 0)
+ return ERR_PTR(ret);
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
-
- block_group = create_chunk(trans, &ctl, devices_info);
+ if (ret < 0)
+ return ERR_PTR(ret);
-out:
- kfree(devices_info);
- return block_group;
+ return create_chunk(trans, &ctl, devices_info);
}
/*
@@ -5547,8 +5615,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5577,18 +5644,17 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
+ if (unlikely(!chunk)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -5615,11 +5681,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_chunk_length(chunk, bg->length);
btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
- btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
- btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
- btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
+ btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
@@ -5631,7 +5697,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- bg->chunk_item_inserted = 1;
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
@@ -5641,7 +5707,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5650,7 +5716,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
struct btrfs_block_group *meta_bg;
+ struct btrfs_space_info *meta_space_info;
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
/*
* When adding a new device for sprouting, the seed device is read-only
@@ -5674,19 +5742,29 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!meta_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!sys_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5695,17 +5773,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5726,37 +5802,57 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
+}
+
+static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+
+ /*
+ * There could be two corrupted data stripes, we need to loop retry in
+ * order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the stripe under
+ * reconstruction.
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return map->num_stripes;
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ return btrfs_raid_array[index].ncopies;
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5765,99 +5861,137 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
- ret = map->num_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- ret = map->sub_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- ret = 2;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- /*
- * There could be two corrupted data stripes, we need
- * to loop retry in order to rebuild the correct data.
- *
- * Fail a stripe at a time on every retry except the
- * stripe under reconstruction.
- */
- ret = map->num_stripes;
- else
- ret = 1;
- free_extent_map(em);
-
- down_read(&fs_info->dev_replace.rwsem);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
- fs_info->dev_replace.tgtdev)
- ret++;
- up_read(&fs_info->dev_replace.rwsem);
-
+ ret = btrfs_chunk_map_num_copies(map);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- len = map->stripe_len * nr_data_stripes(map);
- free_extent_map(em);
+ len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
+ btrfs_free_chunk_map(map);
}
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
- int ret = 0;
+ for (int index = first; index < first + num_stripes; index++) {
+ const struct btrfs_device *device = map->stripes[index].dev;
+
+ if (device->devid == READ_ONCE(device->fs_devices->read_devid))
+ return index;
+ }
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ /* If no read-preferred device is set use the first stripe. */
+ return first;
+}
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- ret = 1;
- free_extent_map(em);
+struct stripe_mirror {
+ u64 devid;
+ int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+ const struct stripe_mirror *s1 = (const struct stripe_mirror *)a;
+ const struct stripe_mirror *s2 = (const struct stripe_mirror *)b;
+
+ if (s1->devid < s2->devid)
+ return -1;
+ if (s1->devid > s2->devid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Select a stripe for reading using the round-robin algorithm.
+ *
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
+ * sectors per device.
+ * 2. Determine the stripe number for the current read by taking the modulus
+ * of the read cycle with the total number of stripes:
+ *
+ * stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ */
+static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+ struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
+ struct btrfs_device *device = map->stripes[first].dev;
+ struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
+ unsigned int read_cycle;
+ unsigned int total_reads;
+ unsigned int min_reads_per_dev;
+
+ total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
+ min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
+ fs_info->sectorsize_bits;
+
+ for (int index = 0, i = first; i < first + num_stripes; i++) {
+ stripes[index].devid = map->stripes[i].dev->devid;
+ stripes[index].num = i;
+ index++;
}
- return ret;
+ sort(stripes, num_stripes, sizeof(struct stripe_mirror),
+ btrfs_cmp_devid, NULL);
+
+ read_cycle = total_reads / min_reads_per_dev;
+ return stripes[read_cycle % num_stripes].num;
}
+#endif
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
- int dev_replace_is_ongoing)
+ struct btrfs_chunk_map *map, int first,
+ bool dev_replace_is_ongoing)
{
+ const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
int num_stripes;
int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
- ASSERT((map->type &
- (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
+ ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)),
+ "type=%llu", map->type);
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
else
num_stripes = map->num_stripes;
- switch (fs_info->fs_devices->read_policy) {
+ switch (policy) {
default:
/* Shouldn't happen, just warn and use pid instead of failing */
- btrfs_warn_rl(fs_info,
- "unknown read_policy type %u, reset to pid",
- fs_info->fs_devices->read_policy);
- fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
+ policy);
+ WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
fallthrough;
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_READ_POLICY_RR:
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+ break;
+ case BTRFS_READ_POLICY_DEVID:
+ preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
+ break;
+#endif
}
if (dev_replace_is_ongoing &&
@@ -5889,49 +6023,23 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
+EXPORT_FOR_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes)
{
- int i;
- int again = 1;
-
- while (again) {
- again = 0;
- for (i = 0; i < num_stripes - 1; i++) {
- /* Swap if parity is on a smaller index */
- if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
- swap(bioc->stripes[i], bioc->stripes[i + 1]);
- swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
- again = 1;
- }
- }
- }
-}
+ struct btrfs_io_context *bioc;
-static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- int total_stripes,
- int real_stripes)
-{
- struct btrfs_io_context *bioc = kzalloc(
- /* The size of btrfs_io_context */
- sizeof(struct btrfs_io_context) +
- /* Plus the variable array for the stripes */
- sizeof(struct btrfs_io_stripe) * (total_stripes) +
- /* Plus the variable array for the tgt dev */
- sizeof(int) * (real_stripes) +
- /*
- * Plus the raid_map, which includes both the tgt dev
- * and the stripes.
- */
- sizeof(u64) * (total_stripes),
- GFP_NOFS|__GFP_NOFAIL);
+ bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS);
+
+ if (!bioc)
+ return NULL;
- atomic_set(&bioc->error, 0);
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
- bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
- bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+ bioc->replace_stripe_src = -1;
+ bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -5950,75 +6058,66 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
kfree(bioc);
}
-/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
-static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 *length_ret,
- struct btrfs_io_context **bioc_ret)
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
- struct btrfs_io_context *bioc;
+ struct btrfs_chunk_map *map;
+ struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
- u64 stripe_nr;
- u64 stripe_nr_end;
+ u32 stripe_nr;
+ u32 stripe_nr_end;
+ u32 stripe_cnt;
u64 stripe_end_offset;
- u64 stripe_cnt;
- u64 stripe_len;
u64 stripe_offset;
- u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
+ u32 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
- int ret = 0;
+ int ret;
int i;
- /* Discard always returns a bioc. */
- ASSERT(bioc_ret);
-
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
- map = em->map_lookup;
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
- goto out;
+ goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
- stripe_len = map->stripe_len;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- stripe_nr = div64_u64(offset, stripe_len);
+ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
/* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_nr * stripe_len;
+ stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
- stripe_nr_end = round_up(offset + length, map->stripe_len);
- stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
+ BTRFS_STRIPE_LEN_SHIFT;
stripe_cnt = stripe_nr_end - stripe_nr;
- stripe_end_offset = stripe_nr_end * map->stripe_len -
+ stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
(offset + length);
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- num_stripes = 1;
+ *num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
@@ -6028,41 +6127,41 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
- num_stripes = min_t(u64, map->num_stripes,
+ *num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index = stripe_nr % factor;
+ stripe_nr /= factor;
stripe_index *= sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_cnt, factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
+
+ remaining_stripes = stripe_cnt % factor;
+ stripes_per_dev = stripe_cnt / factor;
+ last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = map->num_stripes;
+ *num_stripes = map->num_stripes;
} else {
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
+ stripe_index = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
}
- bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
- if (!bioc) {
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
ret = -ENOMEM;
- goto out;
+ goto out_free_map;
}
- for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical =
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bioc->stripes[i].length = stripes_per_dev *
- map->stripe_len;
+ stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
if (i / sub_stripes < remaining_stripes)
- bioc->stripes[i].length += map->stripe_len;
+ stripes[i].length += BTRFS_STRIPE_LEN;
/*
* Special for the first stripe and
@@ -6073,17 +6172,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bioc->stripes[i].length -= stripe_offset;
+ stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bioc->stripes[i].length -= stripe_end_offset;
+ stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bioc->stripes[i].length = length;
+ stripes[i].length = length;
}
stripe_index++;
@@ -6093,89 +6192,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bioc_ret = bioc;
- bioc->map_type = map->type;
- bioc->num_stripes = num_stripes;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
- * In dev-replace case, for repair case (that's the only case where the mirror
- * is selected explicitly when calling btrfs_map_block), blocks left of the
- * left cursor can also be read from the target drive.
- *
- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
- * array of stripes.
- * For READ, it also needs to be supported using the same mirror number.
- *
- * If the requested block is not left of the left cursor, EIO is returned. This
- * can happen because btrfs_num_copies() returns one more in the dev-replace
- * case.
- */
-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length,
- u64 srcdev_devid, int *mirror_num,
- u64 *physical)
-{
- struct btrfs_io_context *bioc = NULL;
- int num_stripes;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
- int i;
- int ret = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bioc, 0, 0);
- if (ret) {
- ASSERT(bioc == NULL);
- return ret;
- }
-
- num_stripes = bioc->num_stripes;
- if (*mirror_num > num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
- * that means that the requested area is not left of the left
- * cursor
- */
- btrfs_put_bioc(bioc);
- return -EIO;
- }
-
- /*
- * process the rest of the function using the mirror_num of the source
- * drive. Therefore look it up first. At the end, patch the device
- * pointer to the one of the target drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add the
- * mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= bioc->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = bioc->stripes[i].physical;
- }
-
- btrfs_put_bioc(bioc);
-
- ASSERT(found);
- if (!found)
- return -EIO;
-
- *mirror_num = index_srcdev + 1;
- *physical = physical_of_found;
- return ret;
+ btrfs_free_chunk_map(map);
+ return stripes;
+out_free_map:
+ btrfs_free_chunk_map(map);
+ return ERR_PTR(ret);
}
static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
@@ -6189,663 +6210,536 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
cache = btrfs_lookup_block_group(fs_info, logical);
- spin_lock(&cache->lock);
- ret = cache->to_copy;
- spin_unlock(&cache->lock);
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
}
-static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_io_context **bioc_ret,
+static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_dev_replace *dev_replace,
u64 logical,
- int *num_stripes_ret, int *max_errors_ret)
+ struct btrfs_io_geometry *io_geom)
{
- struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
- int tgtdev_indexes = 0;
- int num_stripes = *num_stripes_ret;
- int max_errors = *max_errors_ret;
+ /*
+ * At this stage, num_stripes is still the real number of stripes,
+ * excluding the duplicated stripes.
+ */
+ int num_stripes = io_geom->num_stripes;
+ int max_errors = io_geom->max_errors;
+ int nr_extra_stripes = 0;
int i;
- if (op == BTRFS_MAP_WRITE) {
- int index_where_to_add;
+ /*
+ * A block group which has "to_copy" set will eventually be copied by
+ * the dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
+ * Duplicate the write operations while the dev-replace procedure is
+ * running. Since the copying of the old disk to the new disk takes
+ * place at run time while the filesystem is mounted writable, the
+ * regular write operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that the
+ * write to the old disk is already set up in the stripes array.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_io_stripe *old = &bioc->stripes[i];
+ struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
- /*
- * A block group which have "to_copy" set will eventually
- * copied by dev-replace process. We can avoid cloning IO here.
- */
- if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
- return;
+ if (old->dev->devid != srcdev_devid)
+ continue;
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk to
- * the new disk takes place at run time while the filesystem is
- * mounted writable, the regular write operations to the old
- * disk have to be duplicated to go to the new disk as well.
- *
- * Note that device->missing is handled by the caller, and that
- * the write to the old disk is already set up in the stripes
- * array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_io_stripe *new =
- bioc->stripes + index_where_to_add;
- struct btrfs_io_stripe *old =
- bioc->stripes + i;
-
- new->physical = old->physical;
- new->length = old->length;
- new->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
+ new->physical = old->physical;
+ new->dev = dev_replace->tgtdev;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ bioc->replace_stripe_src = i;
+ nr_extra_stripes++;
+ }
+
+ /* We can only have at most 2 extra nr_stripes (for DUP). */
+ ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes);
+ /*
+ * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
+ * replace.
+ * If we have 2 extra stripes, only choose the one with smaller physical.
+ */
+ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
+ struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
+
+ /* Only DUP can have two extra stripes. */
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP,
+ "map_type=%llu", bioc->map_type);
/*
- * During the dev-replace procedure, the target drive can also
- * be used to read data in case it is needed to repair a corrupt
- * block elsewhere. This is possible if the requested area is
- * left of the left cursor. In this area, the target drive is a
- * full copy of the source drive.
+ * Swap the last stripe stripes and reduce @nr_extra_stripes.
+ * The extra stripe would still be there, but won't be accessed.
*/
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it simple,
- * only add the mirror with the lowest physical
- * address
- */
- if (found &&
- physical_of_found <= bioc->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bioc->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_io_stripe *tgtdev_stripe =
- bioc->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bioc->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
+ if (first->physical > second->physical) {
+ swap(second->physical, first->physical);
+ swap(second->dev, first->dev);
+ nr_extra_stripes--;
}
}
- *num_stripes_ret = num_stripes;
- *max_errors_ret = max_errors;
- bioc->num_tgtdevs = tgtdev_indexes;
- *bioc_ret = bioc;
+ io_geom->num_stripes = num_stripes + nr_extra_stripes;
+ io_geom->max_errors = max_errors + nr_extra_stripes;
+ bioc->replace_nr_stripes = nr_extra_stripes;
}
-static bool need_full_stripe(enum btrfs_map_op op)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
- return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
-}
-
-/*
- * Calculate the geometry of a particular (address, len) tuple. This
- * information is used to calculate how big a particular bio can get before it
- * straddles a stripe.
- *
- * @fs_info: the filesystem
- * @em: mapping containing the logical extent
- * @op: type of operation - write or read
- * @logical: address that we want to figure out the geometry of
- * @io_geom: pointer used to return values
- *
- * Returns < 0 in case a chunk for the given logical address cannot be found,
- * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
- */
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
- enum btrfs_map_op op, u64 logical,
- struct btrfs_io_geometry *io_geom)
-{
- struct map_lookup *map;
- u64 len;
- u64 offset;
- u64 stripe_offset;
- u64 stripe_nr;
- u64 stripe_len;
- u64 raid56_full_stripe_start = (u64)-1;
- int data_stripes;
-
- ASSERT(op != BTRFS_MAP_DISCARD);
-
- map = em->map_lookup;
- /* Offset of this logical address in the chunk */
- offset = logical - em->start;
- /* Len of a stripe in a chunk */
- stripe_len = map->stripe_len;
- /* Stripe where this block falls in */
- stripe_nr = div64_u64(offset, stripe_len);
- /* Offset of stripe in the chunk */
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
-"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
- stripe_offset, offset, em->start, logical, stripe_len);
- return -EINVAL;
- }
-
- /* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_offset;
- data_stripes = nr_data_stripes(map);
+ /*
+ * Stripe_nr is the stripe where this block falls. stripe_offset is
+ * the offset of this block in its stripe.
+ */
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX,
+ "stripe_offset=%llu", io_geom->stripe_offset);
- /* Only stripe based profiles needs to check against stripe length. */
- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
- u64 max_len = stripe_len - stripe_offset;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ unsigned long full_stripe_len =
+ btrfs_stripe_nr_to_offset(nr_data_stripes(map));
/*
- * In case of raid56, we need to know the stripe aligned start
+ * For full stripe start, we use previously calculated
+ * @stripe_nr. Align it to nr_data_stripes, then multiply with
+ * STRIPE_LEN.
+ *
+ * By this we can avoid u64 division completely. And we have
+ * to go rounddown(), not round_down(), as nr_data_stripes is
+ * not ensured to be power of 2.
+ */
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
+
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset,
+ "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu",
+ io_geom->raid56_full_stripe_start, full_stripe_len, offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset,
+ "raid56_full_stripe_start=%llu offset=%llu",
+ io_geom->raid56_full_stripe_start, offset);
+ /*
+ * For writes to RAID56, allow to write a full stripe set, but
+ * no straddling of stripe sets.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- unsigned long full_stripe_len = stripe_len * data_stripes;
- raid56_full_stripe_start = offset;
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
+ }
- /*
- * Allow a write of a full stripe, but make sure we
- * don't allow straddling of stripes
- */
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
- full_stripe_len);
- raid56_full_stripe_start *= full_stripe_len;
+ /*
+ * For other RAID types and for RAID56 reads, allow a single stripe (on
+ * a single disk).
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
+ return U64_MAX;
+}
- /*
- * For writes to RAID[56], allow a full stripeset across
- * all disks. For other RAID types and for RAID[56]
- * reads, just allow a single stripe (on a single disk).
- */
- if (op == BTRFS_MAP_WRITE) {
- max_len = stripe_len * data_stripes -
- (offset - raid56_full_stripe_start);
- }
- }
- len = min_t(u64, em->len - offset, max_len);
- } else {
- len = em->len - offset;
- }
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
- io_geom->len = len;
- io_geom->offset = offset;
- io_geom->stripe_len = stripe_len;
- io_geom->stripe_nr = stripe_nr;
- io_geom->stripe_offset = stripe_offset;
- io_geom->raid56_stripe_offset = raid56_full_stripe_start;
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type,
+ io_geom->stripe_index, dst);
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
return 0;
}
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map)
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+ const struct btrfs_io_stripe *smap,
+ const struct btrfs_chunk_map *map,
+ int num_alloc_stripes,
+ struct btrfs_io_geometry *io_geom)
{
- struct extent_map *em;
- struct map_lookup *map;
- u64 stripe_offset;
- u64 stripe_nr;
- u64 stripe_len;
- u32 stripe_index;
- int data_stripes;
- int i;
- int ret = 0;
- int num_stripes;
- int max_errors = 0;
- int tgtdev_indexes = 0;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
- int num_alloc_stripes;
- int patch_the_first_stripe_for_dev_replace = 0;
- u64 physical_to_patch_in_first_stripe = 0;
- u64 raid56_full_stripe_start = (u64)-1;
- struct btrfs_io_geometry geom;
-
- ASSERT(bioc_ret);
- ASSERT(op != BTRFS_MAP_DISCARD);
+ if (!smap)
+ return false;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
+ if (num_alloc_stripes != 1)
+ return false;
- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
- if (ret < 0)
- return ret;
+ if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
+ return false;
- map = em->map_lookup;
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
+ return false;
- *length = geom.len;
- stripe_len = geom.stripe_len;
- stripe_nr = geom.stripe_nr;
- stripe_offset = geom.stripe_offset;
- raid56_full_stripe_start = geom.raid56_stripe_offset;
- data_stripes = nr_data_stripes(map);
+ return true;
+}
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- /*
- * Hold the semaphore for read during the whole operation, write is
- * requested at commit time but must wait.
- */
- if (!dev_replace_is_ongoing)
- up_read(&dev_replace->rwsem);
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ if (io_geom->op == BTRFS_MAP_READ)
+ io_geom->mirror_num = 1;
+}
- if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
- ret = get_extra_mirror_from_replace(fs_info, logical, *length,
- dev_replace->srcdev->devid,
- &mirror_num,
- &physical_to_patch_in_first_stripe);
- if (ret)
- goto out;
- else
- patch_the_first_stripe_for_dev_replace = 1;
- } else if (mirror_num > map->num_stripes) {
- mirror_num = 0;
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
}
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
- if (!need_full_stripe(op))
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (need_full_stripe(op))
- num_stripes = map->num_stripes;
- else if (mirror_num)
- stripe_index = mirror_num - 1;
- else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (need_full_stripe(op)) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
- stripe_index *= map->sub_stripes;
-
- if (need_full_stripe(op))
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
- /* push stripe_nr back to the start of the full stripe */
- stripe_nr = div64_u64(raid56_full_stripe_start,
- stripe_len * data_stripes);
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = nr_parity_stripes(map);
-
- *length = map->stripe_len;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- /*
- * Mirror #0 or #1 means the original data block.
- * Mirror #2 is RAID5 parity block.
- * Mirror #3 is RAID6 Q block.
- */
- stripe_nr = div_u64_rem(stripe_nr,
- data_stripes, &stripe_index);
- if (mirror_num > 1)
- stripe_index = data_stripes + mirror_num - 2;
-
- /* We distribute the parity blocks across stripes */
- div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
- &stripe_index);
- if (!need_full_stripe(op) && mirror_num <= 1)
- mirror_num = 1;
- }
- } else {
- /*
- * after this, stripe_nr is the number of stripes on this
- * device we have to walk to find the data, and stripe_index is
- * the number of our device in the stripe array
- */
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
- mirror_num = stripe_index + 1;
- }
- if (stripe_index >= map->num_stripes) {
- btrfs_crit(fs_info,
- "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
- ret = -EINVAL;
- goto out;
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
}
- num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
- if (op == BTRFS_MAP_WRITE)
- num_alloc_stripes <<= 1;
- if (op == BTRFS_MAP_GET_READ_MIRRORS)
- num_alloc_stripes++;
- tgtdev_indexes = num_stripes;
- }
+ io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
- if (!bioc) {
- ret = -ENOMEM;
- goto out;
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
}
- for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical = map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
- stripe_index++;
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
}
- /* Build raid_map */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
- (need_full_stripe(op) || mirror_num > 1)) {
- u64 tmp;
- unsigned rot;
-
- /* Work out the disk rotation on this stripe-set */
- div_u64_rem(stripe_nr, num_stripes, &rot);
+ io_geom->mirror_num = 1;
+}
- /* Fill in the logical address of each stripe */
- tmp = stripe_nr * data_stripes;
- for (i = 0; i < data_stripes; i++)
- bioc->raid_map[(i + rot) % num_stripes] =
- em->start + (tmp + i) * map->stripe_len;
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ u32 factor = map->num_stripes / map->sub_stripes;
+ int old_stripe_index;
- bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
- if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bioc->raid_map[(i + rot + 1) % num_stripes] =
- RAID6_Q_STRIPE;
+ io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+ io_geom->stripe_nr /= factor;
- sort_parity_stripes(bioc, num_stripes);
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->sub_stripes;
+ return;
}
- if (need_full_stripe(op))
- max_errors = btrfs_chunk_max_errors(map);
-
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
- need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index += io_geom->mirror_num - 1;
+ return;
}
- *bioc_ret = bioc;
- bioc->map_type = map->type;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ old_stripe_index = io_geom->stripe_index;
+ io_geom->stripe_index = find_live_mirror(fs_info, map,
+ io_geom->stripe_index,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ u64 logical, u64 *length)
+{
+ int data_stripes = nr_data_stripes(map);
/*
- * this is the case that REQ_READ && dev_replace_is_ongoing &&
- * mirror_num == num_stripes + 1 && dev_replace target drive is
- * available as a mirror
+ * Needs full stripe mapping.
+ *
+ * Push stripe_nr back to the start of the full stripe For those cases
+ * needing a full stripe, @stripe_nr is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len, but
+ * that can be expensive. Here we just divide @stripe_nr with
+ * @data_stripes.
*/
- if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
- WARN_ON(num_stripes > 1);
- bioc->stripes[0].dev = dev_replace->tgtdev;
- bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
- bioc->mirror_num = map->num_stripes + 1;
- }
-out:
- if (dev_replace_is_ongoing) {
- lockdep_assert_held(&dev_replace->rwsem);
- /* Unlock and let waiting writers proceed */
- up_read(&dev_replace->rwsem);
- }
- free_extent_map(em);
- return ret;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* RAID[56] write or recovery. Return all stripes */
+ io_geom->num_stripes = map->num_stripes;
+ io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end. */
+ *length = min(logical + *length,
+ io_geom->raid56_full_stripe_start + map->start +
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
+ io_geom->stripe_index = 0;
+ io_geom->stripe_offset = 0;
}
-int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret, int mirror_num)
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bioc_ret);
+ int data_stripes = nr_data_stripes(map);
+
+ ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num);
+ /* Just grab the data stripe directly. */
+ io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* We distribute the parity blocks across stripes. */
+ io_geom->stripe_index =
+ (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- mirror_num, 0);
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+ io_geom->mirror_num = 1;
}
-/* For Scrub/replace */
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret)
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ io_geom->mirror_num = io_geom->stripe_index + 1;
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+/*
+ * Map one logical range to one or more physical ranges.
+ *
+ * @length: (Mandatory) mapped length of this run.
+ * One logical range can be split into different segments
+ * due to factors like zones and RAID0/5/6/10 stripe
+ * boundaries.
+ *
+ * @bioc_ret: (Mandatory) returned btrfs_io_context structure.
+ * which has one or more physical ranges (btrfs_io_stripe)
+ * recorded inside.
+ * Caller should call btrfs_put_bioc() to free it after use.
+ *
+ * @smap: (Optional) single physical range optimization.
+ * If the map request can be fulfilled by one single
+ * physical range, and this is parameter is not NULL,
+ * then @bioc_ret would be NULL, and @smap would be
+ * updated.
+ *
+ * @mirror_num_ret: (Mandatory) returned mirror number if the original
+ * value is 0.
+ *
+ * Mirror number 0 means to choose any live mirrors.
+ *
+ * For non-RAID56 profiles, non-zero mirror_num means
+ * the Nth mirror. (e.g. mirror_num 1 means the first
+ * copy).
+ *
+ * For RAID56 profile, mirror 1 means rebuild from P and
+ * the remaining data stripes.
+ *
+ * For RAID6 profile, mirror > 2 means mark another
+ * data/P stripe error and rebuild from the remaining
+ * stripes..
+ */
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- bio->bi_private = bioc->private;
- bio->bi_end_io = bioc->end_io;
- bio_endio(bio);
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
+ u64 map_offset;
+ int ret = 0;
+ int num_copies;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ bool dev_replace_is_ongoing = false;
+ u16 num_alloc_stripes;
+ u64 max_len;
- btrfs_put_bioc(bioc);
-}
+ ASSERT(bioc_ret);
-static void btrfs_end_bio(struct bio *bio)
-{
- struct btrfs_io_context *bioc = bio->bi_private;
- int is_orig_bio = 0;
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
- if (bio->bi_status) {
- atomic_inc(&bioc->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_bio(bio)->device;
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- ASSERT(dev->bdev);
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
- }
+ num_copies = btrfs_chunk_map_num_copies(map);
+ if (io_geom.mirror_num > num_copies)
+ return -EINVAL;
- if (bio == bioc->orig_bio)
- is_orig_bio = 1;
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
+ io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
- btrfs_bio_counter_dec(bioc->fs_info);
+ if (dev_replace->replace_task != current)
+ down_read(&dev_replace->rwsem);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bioc->orig_bio;
- }
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ /*
+ * Hold the semaphore for read during the whole operation, write is
+ * requested at commit time but must wait.
+ */
+ if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
+ up_read(&dev_replace->rwsem);
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
+ /*
+ * After this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
*/
- if (atomic_read(&bioc->error) > bioc->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
-
- btrfs_end_bioc(bioc, bio);
- } else if (!is_orig_bio) {
- bio_put(bio);
+ map_blocks_single(map, &io_geom);
+ break;
+ }
+ if (io_geom.stripe_index >= map->num_stripes) {
+ btrfs_crit(fs_info,
+ "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
+ io_geom.stripe_index, map->num_stripes);
+ ret = -EINVAL;
+ goto out;
}
-}
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
- u64 physical, struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
+ num_alloc_stripes = io_geom.num_stripes;
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ op != BTRFS_MAP_READ)
+ /*
+ * For replace case, we need to add extra stripes for extra
+ * duplicated stripes.
+ *
+ * For both WRITE and GET_READ_MIRRORS, we may have at most
+ * 2 more stripes (DUP types, otherwise 1).
+ */
+ num_alloc_stripes += 2;
- bio->bi_private = bioc;
- btrfs_bio(bio)->device = dev;
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_iter.bi_sector = physical >> 9;
/*
- * For zone append writing, bi_sector must point the beginning of the
- * zone
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
*/
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical, fs_info->zone_size);
-
- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
- } else {
- bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
- bio->bi_opf |= REQ_OP_WRITE;
- }
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
+ if (mirror_num_ret)
+ *mirror_num_ret = io_geom.mirror_num;
+ *bioc_ret = NULL;
+ goto out;
}
- btrfs_debug_in_rcu(fs_info,
- "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
- (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
- dev->devid, bio->bi_iter.bi_size);
- bio_set_dev(bio, dev->bdev);
-
- btrfs_bio_counter_inc_noblocked(fs_info);
- btrfsic_submit_bio(bio);
-}
-
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bioc->orig_bio);
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bioc(bioc, bio);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
+ if (!bioc) {
+ ret = -ENOMEM;
+ goto out;
}
-}
-
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num)
-{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
- u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
- int ret;
- int dev_nr;
- int total_devs;
- struct btrfs_io_context *bioc = NULL;
-
- length = bio->bi_iter.bi_size;
- map_length = length;
+ bioc->map_type = map->type;
+ bioc->use_rst = io_geom.use_rst;
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bioc, mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
- }
-
- total_devs = bioc->num_stripes;
- bioc->orig_bio = first_bio;
- bioc->private = first_bio->bi_private;
- bioc->end_io = first_bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, bioc->num_stripes);
-
- if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- /* In this case, map_length has been set to the length of
- a single stripe; not the whole write */
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(bio, bioc, map_length);
- } else {
- ret = raid56_parity_recover(bio, bioc, map_length,
- mirror_num, 1);
+ /*
+ * For RAID56 full map, we need to make sure the stripes[] follows the
+ * rule that data stripes are all ordered, then followed with P and Q
+ * (if we have).
+ *
+ * It's still mostly the same as other profiles, just with extra rotation.
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
+ /*
+ * For RAID56 @stripe_nr is already the number of full stripes
+ * before us, which is also the rotation value (needs to modulo
+ * with num_stripes).
+ *
+ * In this case, we just add @stripe_nr with @i, then do the
+ * modulo, to reduce one modulo call.
+ */
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
+ }
+ } else {
+ /*
+ * For all other non-RAID56 profiles, just copy the target
+ * stripe into the bioc.
+ */
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
+ if (ret < 0)
+ break;
+ io_geom.stripe_index++;
}
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
}
- if (map_length < length) {
- btrfs_crit(fs_info,
- "mapping failed logical %llu bio len %llu len %llu",
- logical, length, map_length);
- BUG();
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
}
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bioc->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bioc_error(bioc, first_bio, logical);
- continue;
- }
+ if (op != BTRFS_MAP_READ)
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
- if (dev_nr < total_devs - 1)
- bio = btrfs_bio_clone(first_bio);
- else
- bio = first_bio;
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ op != BTRFS_MAP_READ) {
+ handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
+ }
+
+ *bioc_ret = bioc;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
- submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
+out:
+ if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
+ lockdep_assert_held(&dev_replace->rwsem);
+ /* Unlock and let waiting writers proceed */
+ up_read(&dev_replace->rwsem);
}
- btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
+ btrfs_free_chunk_map(map);
+ return ret;
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -6861,18 +6755,20 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
- ASSERT((args->devid != (u64)-1) || args->missing);
+ if (args->devt)
+ return device->devt == args->devt;
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
- if ((args->devid != (u64)-1) && device->devid != args->devid)
+ if (device->devid != args->devid)
return false;
if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
return false;
- if (!args->missing)
- return true;
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
- !device->bdev)
- return true;
- return false;
+ return true;
}
/*
@@ -6919,8 +6815,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* always do NOFS because we use it in a lot of other GFP_KERNEL safe
* places.
*/
+
nofs_flag = memalloc_nofs_save();
- device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6935,22 +6832,24 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
return device;
}
-/**
- * btrfs_alloc_device - allocate struct btrfs_device
+/*
+ * Allocate new device struct, set up devid and UUID.
+ *
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
+ * @path: a pointer to device path if available, NULL otherwise.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
* destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
- const u64 *devid,
- const u8 *uuid)
+ const u64 *devid, const u8 *uuid,
+ const char *path)
{
struct btrfs_device *dev;
u64 tmp;
@@ -6962,24 +6861,13 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (!dev)
return ERR_PTR(-ENOMEM);
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -6999,6 +6887,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
+ if (path) {
+ const char *name;
+
+ name = kstrdup(path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+ rcu_assign_pointer(dev->name, name);
+ }
+
return dev;
}
@@ -7013,11 +6912,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const int data_stripes = calc_data_stripes(type, num_stripes);
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(chunk_len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -7060,19 +6959,39 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
}
#endif
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
+
+ return dev;
+}
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
u64 type;
u8 uuid[BTRFS_UUID_SIZE];
+ int index;
int num_stripes;
int ret;
int i;
@@ -7080,6 +6999,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ index = btrfs_bg_flags_to_raid_index(type);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
#if BITS_PER_LONG == 32
@@ -7089,54 +7009,37 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
- /*
- * Only need to verify chunk item if we're reading from sys chunk array,
- * as chunk item in tree block is already verified by tree-checker.
- */
- if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
- ret = btrfs_check_chunk_valid(leaf, chunk, logical);
- if (ret)
- return ret;
- }
-
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
- map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = type;
- map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ /*
+ * We can't use the sub_stripes value, as for profiles other than
+ * RAID10, they may have 0 as sub_stripes for filesystems created by
+ * older mkfs (<v5.4).
+ * In that case, it can cause divide-by-zero errors later.
+ * Since currently sub_stripes is fixed for each profile, let's
+ * use the trusted value instead.
+ */
+ map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(type, em->len,
- map->num_stripes);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7147,39 +7050,27 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
BTRFS_UUID_SIZE);
args.uuid = uuid;
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!map->stripes[i].dev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
- free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid, true);
- return -ENOENT;
- }
if (!map->stripes[i].dev) {
- map->stripes[i].dev =
- add_missing_dev(fs_info->fs_devices, devid,
- uuid);
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
- free_extent_map(em);
- btrfs_err(fs_info,
- "failed to init missing dev %llu: %ld",
- devid, PTR_ERR(map->stripes[i].dev));
- return PTR_ERR(map->stripes[i].dev);
+ ret = PTR_ERR(map->stripes[i].dev);
+ btrfs_free_chunk_map(map);
+ return ret;
}
- btrfs_report_missing_device(fs_info, devid, uuid, false);
}
+
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
-
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
+ btrfs_free_chunk_map(map);
}
- free_extent_map(em);
return ret;
}
@@ -7224,10 +7115,14 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
- if (!btrfs_test_opt(fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "failed to find fsid %pU when attempting to open seed devices",
+ fsid);
return ERR_PTR(-ENOENT);
+ }
- fs_devices = alloc_fs_devices(fsid, NULL);
+ fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7244,7 +7139,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
@@ -7273,7 +7168,8 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = args.devid = btrfs_device_id(leaf, dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ args.devid = devid;
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
@@ -7373,46 +7269,26 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
- struct btrfs_disk_key *disk_key;
- struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
- u32 num_stripes;
u32 array_size;
- u32 len = 0;
u32 cur_offset;
- u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
/*
- * This will create extent buffer of nodesize, superblock size is
- * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
- * overallocate but we can keep it as-is, only the first page is used.
+ * We allocated a dummy extent, just to use extent buffer accessors.
+ * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
+ * that's fine, we will not go beyond system chunk array anyway.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
- root->root_key.objectid, 0);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
set_extent_buffer_uptodate(sb);
- /*
- * The sb extent buffer is artificial and just used to read the system array.
- * set_extent_buffer_uptodate() call does not properly mark all it's
- * pages up-to-date when the page is larger: extent does not cover the
- * whole page and consequently check_page_uptodate does not find all
- * the page's extents up-to-date (the hole beyond sb),
- * write_extent_buffer then triggers a WARN_ON.
- *
- * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
- * but sb spans only this function. Add an explicit SetPageUptodate call
- * to silence the warning eg. on PowerPC 64.
- */
- if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -7422,10 +7298,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
cur_offset = 0;
while (cur_offset < array_size) {
- disk_key = (struct btrfs_disk_key *)array_ptr;
- len = sizeof(*disk_key);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ struct btrfs_chunk *chunk;
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr;
+ u32 len = sizeof(*disk_key);
+
+ /*
+ * The sys_chunk_array has been already verified at super block
+ * read time. Only do ASSERT()s for basic checks.
+ */
+ ASSERT(cur_offset + len <= array_size);
btrfs_disk_key_to_cpu(&key, disk_key);
@@ -7433,44 +7314,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
- btrfs_err(fs_info,
- "unexpected item type %u in sys_array at offset %u",
- (u32)key.type, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);
chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be present,
- * exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
+ len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk));
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ ASSERT(cur_offset + len <= array_size);
ret = read_one_chunk(&key, sb, chunk);
if (ret)
@@ -7483,13 +7334,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
-
-out_short_read:
- btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
- len, cur_offset);
- clear_extent_buffer_uptodate(sb);
- free_extent_buffer_stale(sb);
- return -EIO;
}
/*
@@ -7503,26 +7347,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7540,18 +7379,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7569,12 +7405,13 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
int slot;
+ int iter_ret = 0;
u64 total_dev = 0;
u64 last_ra_node = 0;
@@ -7599,7 +7436,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
- * used for freeze procection of a fs (struct super_block.s_writers),
+ * used for freeze protection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
@@ -7607,7 +7444,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* chunk tree, to keep it simple, just skip locking on the chunk tree.
*/
ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
- path->skip_locking = 1;
+ path->skip_locking = true;
/*
* Read all device items, and then all the chunk items. All
@@ -7616,32 +7453,20 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- struct extent_buffer *node;
+ key.offset = 0;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
leaf = path->nodes[0];
slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- break;
- }
- node = path->nodes[1];
+
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@@ -7666,7 +7491,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto error;
}
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
}
/*
@@ -7674,12 +7503,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@@ -7693,29 +7522,32 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
ret = 0;
error:
mutex_unlock(&uuid_mutex);
-
- btrfs_free_path(path);
return ret;
}
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
-
- fs_devices->fs_info = fs_info;
+ int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- list_for_each_entry(device, &seed_devs->devices, dev_list)
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
device->fs_info = fs_info;
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
seed_devs->fs_info = fs_info;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7789,7 +7621,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int ret = 0;
path = btrfs_alloc_path();
@@ -7811,8 +7643,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
}
out:
mutex_unlock(&fs_devices->device_list_mutex);
-
- btrfs_free_path(path);
return ret;
}
@@ -7821,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
@@ -7837,10 +7667,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"error %d while searching for dev_stats item for device %s",
- ret, rcu_str_deref(device->name));
- goto out;
+ ret, btrfs_dev_name(device));
+ return ret;
}
if (ret == 0 &&
@@ -7848,10 +7678,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"delete too small dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
- goto out;
+ btrfs_dev_name(device), ret);
+ return ret;
}
ret = 1;
}
@@ -7862,10 +7692,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"insert dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
- goto out;
+ btrfs_dev_name(device), ret);
+ return ret;
}
}
@@ -7874,10 +7704,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(eb);
-
-out:
- btrfs_free_path(path);
return ret;
}
@@ -7924,16 +7750,12 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
- btrfs_dev_stat_print_on_error(dev);
-}
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
-{
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->fs_info,
+ btrfs_err_rl(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -7951,9 +7773,9 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->fs_info,
+ btrfs_info(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -8011,7 +7833,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
- ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state);
if (list_empty(&trans->dev_update_list))
return;
@@ -8041,27 +7863,20 @@ int btrfs_bg_type_to_factor(u64 flags)
return btrfs_raid_array[index].ncopies;
}
-
-
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (unlikely(!map)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -8069,25 +7884,34 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
- if (physical_len != stripe_len) {
+ stripe_len = btrfs_calc_stripe_length(map);
+ if (unlikely(physical_len != stripe_len)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
}
+ /*
+ * Very old mkfs.btrfs (before v4.15) will not respect the reserved
+ * space. Although kernel can handle it without problem, better to warn
+ * the users.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->devid == devid &&
- map->stripes[i].physical == physical_offset) {
+ if (unlikely(map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset)) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -8095,7 +7919,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
break;
}
}
- if (!found) {
+ if (unlikely(!found)) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
@@ -8104,13 +7928,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!dev) {
+ if (unlikely(!dev)) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- if (physical_offset + physical_len > dev->disk_total_bytes) {
+ if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
@@ -8122,8 +7946,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (dev->zone_info) {
u64 zone_size = dev->zone_info->zone_size;
- if (!IS_ALIGNED(physical_offset, zone_size) ||
- !IS_ALIGNED(physical_len, zone_size)) {
+ if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size))) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
@@ -8133,32 +7957,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (unlikely(map->num_stripes != map->verified_stripes)) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -8171,7 +7993,7 @@ out:
*/
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
u64 prev_devid = 0;
@@ -8202,17 +8024,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
path->reada = READA_FORWARD;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(ret > 0))
+ return -EUCLEAN;
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
@@ -8234,24 +8054,23 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
- if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
- goto out;
+ return ret;
prev_devid = devid;
prev_dev_ext_end = physical_offset + physical_len;
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
ret = 0;
break;
@@ -8259,10 +8078,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
}
/* Ensure all chunks have corresponding dev extents */
- ret = verify_chunk_dev_extent_mapping(fs_info);
-out:
- btrfs_free_path(path);
- return ret;
+ return verify_chunk_dev_extent_mapping(fs_info);
}
/*
@@ -8291,7 +8107,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
static int relocating_repair_kthread(void *data)
{
- struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_block_group *cache = data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;
@@ -8299,6 +8115,8 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
+ guard(super_write)(fs_info->sb);
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
@@ -8313,7 +8131,7 @@ static int relocating_repair_kthread(void *data)
if (!cache)
goto out;
- if (!cache->relocating_repair)
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
goto out;
ret = btrfs_may_alloc_data_chunk(fs_info, target);
@@ -8323,7 +8141,7 @@ static int relocating_repair_kthread(void *data)
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
- ret = btrfs_relocate_chunk(fs_info, target);
+ ret = btrfs_relocate_chunk(fs_info, target, true);
out:
if (cache)
@@ -8349,17 +8167,87 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
if (!cache)
return true;
- spin_lock(&cache->lock);
- if (cache->relocating_repair) {
- spin_unlock(&cache->lock);
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
btrfs_put_block_group(cache);
return true;
}
- cache->relocating_repair = 1;
- spin_unlock(&cache->lock);
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
return true;
}
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap,
+ u64 logical)
+{
+ int data_stripes = nr_bioc_data_stripes(bioc);
+ int i;
+
+ for (i = 0; i < data_stripes; i++) {
+ u64 stripe_start = bioc->full_stripe_logical +
+ btrfs_stripe_nr_to_offset(i);
+
+ if (logical >= stripe_start &&
+ logical < stripe_start + BTRFS_STRIPE_LEN)
+ break;
+ }
+ ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes);
+ smap->dev = bioc->stripes[i].dev;
+ smap->physical = bioc->stripes[i].physical +
+ ((logical - bioc->full_stripe_logical) &
+ BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * Not update any other mirrors nor go through RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+ struct btrfs_io_stripe *smap, u64 logical,
+ u32 length, int mirror_num)
+{
+ struct btrfs_io_context *bioc = NULL;
+ u64 map_length = length;
+ int mirror_ret = mirror_num;
+ int ret;
+
+ ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);
+
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+ &bioc, smap, &mirror_ret);
+ if (ret < 0)
+ return ret;
+
+ /* The map range should not cross stripe boundary. */
+ ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length);
+
+ /* Already mapped to single stripe. */
+ if (!bioc)
+ goto out;
+
+ /* Map the RAID56 multi-stripe writes to a single one. */
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ map_raid56_repair_block(bioc, smap, logical);
+ goto out;
+ }
+
+ ASSERT(mirror_num <= bioc->num_stripes,
+ "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes);
+ smap->dev = bioc->stripes[mirror_num - 1].dev;
+ smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+ btrfs_put_bioc(bioc);
+ ASSERT(smap->dev);
+ return 0;
+}