summaryrefslogtreecommitdiff
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--fs/btrfs/volumes.c1040
1 files changed, 612 insertions, 428 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b39737568c22..93f8f17cacca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
+#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -145,6 +146,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map);
+/*
+ * Device locking
+ * ==============
+ *
+ * There are several mutexes that protect manipulation of devices and low-level
+ * structures like chunks but not block groups, extents or files
+ *
+ * uuid_mutex (global lock)
+ * ------------------------
+ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * the SCAN_DEV ioctl registration or from mount either implicitly (the first
+ * device) or requested by the device= mount option
+ *
+ * the mutex can be very coarse and can cover long-running operations
+ *
+ * protects: updates to fs_devices counters like missing devices, rw devices,
+ * seeding, structure cloning, opening/closing devices at mount/umount time
+ *
+ * global::fs_devs - add, remove, updates to the global list
+ *
+ * does not protect: manipulation of the fs_devices::devices list!
+ *
+ * btrfs_device::name - renames (write side), read is RCU
+ *
+ * fs_devices::device_list_mutex (per-fs, with RCU)
+ * ------------------------------------------------
+ * protects updates to fs_devices::devices, ie. adding and deleting
+ *
+ * simple list traversal with read-only actions can be done with RCU protection
+ *
+ * may be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers)
+ *
+ * volume_mutex
+ * ------------
+ * coarse lock owned by a mounted filesystem; used to exclude some operations
+ * that cannot run in parallel and affect the higher-level properties of the
+ * filesystem like: device add/delete/resize/replace, or balance
+ *
+ * balance_mutex
+ * -------------
+ * protects balance structures (status, state) and context accessed from
+ * several places (internally, ioctl)
+ *
+ * chunk_mutex
+ * -----------
+ * protects chunks, adding or removing during allocation, trim or when a new
+ * device is added/removed
+ *
+ * cleaner_mutex
+ * -------------
+ * a big lock that is held by the cleaner thread and prevents running subvolume
+ * cleaning together with relocation or delayed iputs
+ *
+ *
+ * Lock nesting
+ * ============
+ *
+ * uuid_mutex
+ * volume_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
+ */
+
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
@@ -180,6 +246,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
return fs_devs;
}
+static void free_device(struct btrfs_device *device)
+{
+ rcu_string_free(device->name);
+ bio_put(device->flush_bio);
+ kfree(device);
+}
+
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
@@ -188,8 +261,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- rcu_string_free(device->name);
- kfree(device);
+ free_device(device);
}
kfree(fs_devices);
}
@@ -207,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_cleanup_fs_uuids(void)
+void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -219,6 +291,11 @@ void btrfs_cleanup_fs_uuids(void)
}
}
+/*
+ * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
+ * Returned struct is not linked onto any lists and must be destroyed using
+ * free_device.
+ */
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
@@ -236,7 +313,6 @@ static struct btrfs_device *__alloc_device(void)
kfree(dev);
return ERR_PTR(-ENOMEM);
}
- bio_get(dev->flush_bio);
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
@@ -244,7 +320,6 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->io_lock);
- spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
@@ -360,7 +435,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
int again = 0;
unsigned long num_run;
unsigned long batch_run = 0;
- unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
int sync_pending = 0;
@@ -375,8 +449,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
blk_start_plug(&plug);
bdi = device->bdev->bd_bdi;
- limit = btrfs_async_submit_limit(fs_info);
- limit = limit * 2 / 3;
loop:
spin_lock(&device->io_lock);
@@ -443,13 +515,6 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
- /*
- * atomic_dec_return implies a barrier for waitqueue_active
- */
- if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
- waitqueue_active(&fs_info->async_submit_wait))
- wake_up(&fs_info->async_submit_wait);
-
BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
/*
@@ -517,12 +582,6 @@ loop_lock:
&device->work);
goto done;
}
- /* unplug every 64 requests just for good measure */
- if (batch_run % 64 == 0) {
- blk_finish_plug(&plug);
- blk_start_plug(&plug);
- sync_pending = 0;
- }
}
cond_resched();
@@ -546,84 +605,144 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
-
-void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+/*
+ * Search for and remove all stale devices (devices which are not mounted).
+ * When both inputs are NULL, it will search and release all stale devices.
+ * path: Optional. When provided, it will release all unmounted devices
+ * matching this path only.
+ * skip_dev: Optional. Will skip this device when searching for the stale
+ * devices.
+ */
+static void btrfs_free_stale_devices(const char *path,
+ struct btrfs_device *skip_dev)
{
- struct btrfs_fs_devices *fs_devs;
- struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
+ struct btrfs_device *dev, *tmp_dev;
- if (!cur_dev->name)
- return;
-
- list_for_each_entry(fs_devs, &fs_uuids, list) {
- int del = 1;
+ list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
if (fs_devs->opened)
continue;
- if (fs_devs->seeding)
- continue;
- list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+ list_for_each_entry_safe(dev, tmp_dev,
+ &fs_devs->devices, dev_list) {
+ int not_found = 0;
- if (dev == cur_dev)
+ if (skip_dev && skip_dev == dev)
continue;
- if (!dev->name)
+ if (path && !dev->name)
continue;
- /*
- * Todo: This won't be enough. What if the same device
- * comes back (with new uuid and) with its mapper path?
- * But for now, this does help as mostly an admin will
- * either use mapper or non mapper path throughout.
- */
rcu_read_lock();
- del = strcmp(rcu_str_deref(dev->name),
- rcu_str_deref(cur_dev->name));
+ if (path)
+ not_found = strcmp(rcu_str_deref(dev->name),
+ path);
rcu_read_unlock();
- if (!del)
- break;
- }
+ if (not_found)
+ continue;
- if (!del) {
/* delete the stale device */
if (fs_devs->num_devices == 1) {
btrfs_sysfs_remove_fsid(fs_devs);
list_del(&fs_devs->list);
free_fs_devices(fs_devs);
+ break;
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
- rcu_string_free(dev->name);
- kfree(dev);
+ free_device(dev);
}
- break;
}
}
}
+static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *device, fmode_t flags,
+ void *holder)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ int ret;
+
+ if (device->bdev)
+ return -EINVAL;
+ if (!device->name)
+ return -EINVAL;
+
+ ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh);
+ if (ret)
+ return ret;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->seeding = 1;
+ } else {
+ if (bdev_read_only(bdev))
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ else
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ }
+
+ q = bdev_get_queue(bdev);
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
+
+ device->bdev = bdev;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ device->mode = flags;
+
+ fs_devices->open_devices++;
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ fs_devices->rw_devices++;
+ list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
+ }
+ brelse(bh);
+
+ return 0;
+
+error_brelse:
+ brelse(bh);
+ blkdev_put(bdev, flags);
+
+ return -EINVAL;
+}
+
/*
* Add new device to list of registered devices
*
* Returns:
- * 1 - first time device is seen
- * 0 - device already known
- * < 0 - error
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
*/
-static noinline int device_list_add(const char *path,
- struct btrfs_super_block *disk_super,
- u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+static noinline struct btrfs_device *device_list_add(const char *path,
+ struct btrfs_super_block *disk_super)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
- int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
+ u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
- return PTR_ERR(fs_devices);
+ return ERR_CAST(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
@@ -635,19 +754,19 @@ static noinline int device_list_add(const char *path,
if (!device) {
if (fs_devices->opened)
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- return PTR_ERR(device);
+ return device;
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
- kfree(device);
- return -ENOMEM;
+ free_device(device);
+ return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
@@ -656,8 +775,16 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
- ret = 1;
device->fs_devices = fs_devices;
+ btrfs_free_stale_devices(path, device);
+
+ if (disk_super->label[0])
+ pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
+ disk_super->label, devid, found_transid, path);
+ else
+ pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
+ disk_super->fsid, devid, found_transid, path);
+
} else if (!device->name || strcmp(device->name->str, path)) {
/*
* When FS is already mounted.
@@ -693,17 +820,17 @@ static noinline int device_list_add(const char *path,
* with larger generation number or the last-in if
* generation are equal.
*/
- return -EEXIST;
+ return ERR_PTR(-EEXIST);
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
- if (device->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
- device->missing = 0;
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
}
@@ -716,16 +843,9 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
- /*
- * if there is new btrfs on an already registered device,
- * then remove the stale device entry.
- */
- if (ret > 0)
- btrfs_free_stale_device(device);
-
- *fs_devices_ret = fs_devices;
+ fs_devices->total_devices = btrfs_super_num_devices(disk_super);
- return ret;
+ return device;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -758,7 +878,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
- kfree(device);
+ free_device(device);
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -776,7 +896,11 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
+/*
+ * After we have read the system tree and know devids belonging to
+ * this filesystem, remove the devices which do not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
@@ -785,10 +909,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (device->in_fs_metadata) {
- if (!device->is_tgtdev_for_dev_replace &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state)) {
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state) &&
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
latest_dev = device;
}
continue;
@@ -805,7 +931,8 @@ again:
* not, which means whether this device is
* used or whether it should be removed.
*/
- if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state)) {
continue;
}
}
@@ -814,16 +941,16 @@ again:
device->bdev = NULL;
fs_devices->open_devices--;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
- device->writeable = 0;
- if (!device->is_tgtdev_for_dev_replace)
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state))
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- rcu_string_free(device->name);
- kfree(device);
+ free_device(device);
}
if (fs_devices->seed) {
@@ -836,35 +963,25 @@ again:
mutex_unlock(&uuid_mutex);
}
-static void __free_device(struct work_struct *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, rcu_work);
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
-}
-
-static void free_device(struct rcu_head *head)
+static void free_device_rcu(struct rcu_head *head)
{
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
-
- INIT_WORK(&device->rcu_work, __free_device);
- schedule_work(&device->rcu_work);
+ free_device(device);
}
static void btrfs_close_bdev(struct btrfs_device *device)
{
- if (device->bdev && device->writeable) {
+ if (!device->bdev)
+ return;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
- if (device->bdev)
- blkdev_put(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
}
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
@@ -876,13 +993,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
if (device->bdev)
fs_devices->open_devices--;
- if (device->writeable &&
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
new_device = btrfs_alloc_device(NULL, &device->devid,
@@ -928,7 +1045,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
}
WARN_ON(fs_devices->open_devices);
@@ -958,93 +1075,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
- /*
- * Wait for rcu kworkers under __btrfs_close_devices
- * to finish all blkdev_puts so device is really
- * free when umount is done.
- */
- rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
- struct request_queue *q;
- struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- u64 devid;
- int seeding = 1;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, head, dev_list) {
- if (device->bdev)
- continue;
- if (!device->name)
- continue;
-
/* Just open everything we can; ignore failures here */
- if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh))
+ if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- if (devid != device->devid)
- goto error_brelse;
-
- if (memcmp(device->uuid, disk_super->dev_item.uuid,
- BTRFS_UUID_SIZE))
- goto error_brelse;
-
- device->generation = btrfs_super_generation(disk_super);
if (!latest_dev ||
device->generation > latest_dev->generation)
latest_dev = device;
-
- if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
- device->writeable = 0;
- } else {
- device->writeable = !bdev_read_only(bdev);
- seeding = 0;
- }
-
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
-
- device->bdev = bdev;
- device->in_fs_metadata = 0;
- device->mode = flags;
-
- fs_devices->open_devices++;
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- fs_devices->rw_devices++;
- list_add(&device->dev_alloc_list,
- &fs_devices->alloc_list);
- }
- brelse(bh);
- continue;
-
-error_brelse:
- brelse(bh);
- blkdev_put(bdev, flags);
- continue;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
- fs_devices->seeding = seeding;
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
@@ -1052,6 +1108,20 @@ out:
return ret;
}
+static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_device *dev1, *dev2;
+
+ dev1 = list_entry(a, struct btrfs_device, dev_list);
+ dev2 = list_entry(b, struct btrfs_device, dev_list);
+
+ if (dev1->devid < dev2->devid)
+ return -1;
+ else if (dev1->devid > dev2->devid)
+ return 1;
+ return 0;
+}
+
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
@@ -1062,20 +1132,22 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->opened++;
ret = 0;
} else {
+ list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = __btrfs_open_devices(fs_devices, flags, holder);
}
mutex_unlock(&uuid_mutex);
return ret;
}
-void btrfs_release_disk_super(struct page *page)
+static void btrfs_release_disk_super(struct page *page)
{
kunmap(page);
put_page(page);
}
-int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
- struct page **page, struct btrfs_super_block **disk_super)
+static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
+ struct page **page,
+ struct btrfs_super_block **disk_super)
{
void *p;
pgoff_t index;
@@ -1127,12 +1199,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
+ struct btrfs_device *device;
struct block_device *bdev;
struct page *page;
- int ret = -EINVAL;
- u64 devid;
- u64 transid;
- u64 total_devices;
+ int ret = 0;
u64 bytenr;
/*
@@ -1151,26 +1221,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
+ ret = -EINVAL;
goto error_bdev_put;
-
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- transid = btrfs_super_generation(disk_super);
- total_devices = btrfs_super_num_devices(disk_super);
-
- ret = device_list_add(path, disk_super, devid, fs_devices_ret);
- if (ret > 0) {
- if (disk_super->label[0]) {
- pr_info("BTRFS: device label %s ", disk_super->label);
- } else {
- pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
- }
-
- pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
- ret = 0;
}
- if (!ret && fs_devices_ret)
- (*fs_devices_ret)->total_devices = total_devices;
+
+ device = device_list_add(path, disk_super);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ *fs_devices_ret = device->fs_devices;
btrfs_release_disk_super(page);
@@ -1196,7 +1256,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
*length = 0;
- if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+ if (start >= device->total_bytes ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return 0;
path = btrfs_alloc_path();
@@ -1374,7 +1435,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
max_hole_size = 0;
again:
- if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+ if (search_start >= search_end ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
goto out;
}
@@ -1581,8 +1643,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
- WARN_ON(!device->in_fs_metadata);
- WARN_ON(device->is_tgtdev_for_dev_replace);
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1672,7 +1734,7 @@ error:
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
-static int btrfs_add_device(struct btrfs_trans_handle *trans,
+static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
@@ -1765,20 +1827,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = -ENOENT;
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+
out:
btrfs_free_path(path);
- btrfs_commit_transaction(trans);
+ if (!ret)
+ ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1817,14 +1883,15 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
return 0;
}
-struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
- struct btrfs_device *device)
+static struct btrfs_device * btrfs_find_next_active_device(
+ struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
struct btrfs_device *next_device;
list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
if (next_device != device &&
- !next_device->missing && next_device->bdev)
+ !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
+ && next_device->bdev)
return next_device;
}
@@ -1865,15 +1932,16 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
+ mutex_lock(&fs_info->volume_mutex);
mutex_lock(&uuid_mutex);
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
@@ -1884,17 +1952,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto out;
- if (device->is_tgtdev_for_dev_replace) {
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
- if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
@@ -1916,7 +1985,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto error_undo;
- device->in_fs_metadata = 0;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(fs_info, device);
/*
@@ -1936,7 +2005,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
device->fs_devices->num_devices--;
device->fs_devices->total_devices--;
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
device->fs_devices->missing_devices--;
btrfs_assign_next_active_device(fs_info, device, NULL);
@@ -1956,11 +2025,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* the devices list. All that's left is to zero out the old
* supers and free the device.
*/
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
btrfs_scratch_superblocks(device->bdev, device->name->str);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@@ -1979,10 +2048,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
out:
mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
return ret;
error_undo:
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
&fs_info->fs_devices->alloc_list);
@@ -1997,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices;
- WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
/*
* in case of fs with no seed, srcdev->fs_devices will point
@@ -2008,12 +2078,12 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
fs_devices = srcdev->fs_devices;
list_del_rcu(&srcdev->dev_list);
- list_del_rcu(&srcdev->dev_alloc_list);
+ list_del(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
- if (srcdev->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
fs_devices->missing_devices--;
- if (srcdev->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
fs_devices->rw_devices--;
if (srcdev->bdev)
@@ -2025,25 +2095,26 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (srcdev->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
btrfs_close_bdev(srcdev);
-
- call_rcu(&srcdev->rcu, free_device);
-
- /*
- * unless fs_devices is seed fs, num_devices shouldn't go
- * zero
- */
- BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+ call_rcu(&srcdev->rcu, free_device_rcu);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
struct btrfs_fs_devices *tmp_fs_devices;
+ /*
+ * On a mounted FS, num_devices can't be zero unless it's a
+ * seed. In case of a seed device being replaced, the replace
+ * target is added to the sprout FS, so there will be no more
+ * devices left under the seed FS.
+ */
+ ASSERT(fs_devices->seeding);
+
tmp_fs_devices = fs_info->fs_devices;
while (tmp_fs_devices) {
if (tmp_fs_devices->seed == fs_devices) {
@@ -2089,7 +2160,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
btrfs_close_bdev(tgtdev);
- call_rcu(&tgtdev->rcu, free_device);
+ call_rcu(&tgtdev->rcu, free_device_rcu);
}
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
@@ -2134,7 +2205,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
* is held by the caller.
*/
list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata && !tmp->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &tmp->dev_state) && !tmp->bdev) {
*device = tmp;
break;
}
@@ -2185,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 super_flags;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
return -EINVAL;
@@ -2323,6 +2395,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 tmp;
int seeding_dev = 0;
int ret = 0;
+ bool unlocked = false;
if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
return -EROFS;
@@ -2362,24 +2435,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- kfree(device);
ret = -ENOMEM;
- goto error;
+ goto error_free_device;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- rcu_string_free(device->name);
- kfree(device);
ret = PTR_ERR(trans);
- goto error;
+ goto error_free_device;
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2390,16 +2458,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 0;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
ret = btrfs_prepare_sprout(fs_info);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_trans;
+ }
}
device->fs_devices = fs_info->fs_devices;
@@ -2445,14 +2516,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
}
- ret = btrfs_add_device(trans, fs_info, device);
+ ret = btrfs_add_dev_item(trans, fs_info, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
if (seeding_dev) {
@@ -2461,7 +2532,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
ret = btrfs_finish_sprout(trans, fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error_trans;
+ goto error_sysfs;
}
/* Sprouting would change fsid of the mounted root,
@@ -2479,6 +2550,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
+ unlocked = true;
if (ret) /* transaction commit */
return ret;
@@ -2491,7 +2563,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto error_sysfs;
}
ret = btrfs_commit_transaction(trans);
}
@@ -2500,14 +2574,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
update_dev_time(device_path);
return ret;
-error_trans:
- btrfs_end_transaction(trans);
- rcu_string_free(device->name);
+error_sysfs:
btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
- kfree(device);
+error_trans:
+ if (seeding_dev)
+ sb->s_flags |= SB_RDONLY;
+ if (trans)
+ btrfs_end_transaction(trans);
+error_free_device:
+ free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev) {
+ if (seeding_dev && !unlocked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -2519,7 +2597,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
- struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
@@ -2570,17 +2647,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- kfree(device);
+ free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2588,13 +2662,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
- ASSERT(list_empty(&srcdev->resized_list));
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2612,19 +2685,6 @@ error:
return ret;
}
-void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
- struct btrfs_device *tgtdev)
-{
- u32 sectorsize = fs_info->sectorsize;
-
- WARN_ON(fs_info->fs_devices->rw_devices == 0);
- tgtdev->io_width = sectorsize;
- tgtdev->io_align = sectorsize;
- tgtdev->sector_size = sectorsize;
- tgtdev->fs_info = fs_info;
- tgtdev->in_fs_metadata = 1;
-}
-
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -2680,7 +2740,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
u64 old_total;
u64 diff;
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
new_size = round_down(new_size, fs_info->sectorsize);
@@ -2690,7 +2750,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace) {
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
@@ -2930,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
+ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_can_relocate(fs_info, chunk_offset);
if (ret)
@@ -2943,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
+ /*
+ * We add the kobjects here (and after forcing data chunk creation)
+ * since relocation is the only place we'll create chunks of a new
+ * type at runtime. The only place where we'll remove the last
+ * chunk of a type is the call immediately below this one. Even
+ * so, we're protected against races with the cleaner thread since
+ * we're covered by the delete_unused_bgs_mutex.
+ */
+ btrfs_add_raid_kobjects(fs_info);
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3034,6 +3104,50 @@ error:
return ret;
}
+/*
+ * Return 1 if a data chunk was force-allocated, <0 if joining the transaction
+ * or the allocation failed, and 0 if nothing was needed (the block group at
+ * @chunk_offset is not a data one, or data_sinfo->bytes_used is non-zero).
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytes_used;
+ u64 chunk_type;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ ASSERT(cache); /* NOTE(review): cache is dereferenced just below - confirm the lookup cannot fail here */
+ chunk_type = cache->flags;
+ btrfs_put_block_group(cache);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if (!bytes_used) { /* data_sinfo->bytes_used is zero: force a new data chunk */
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_force_chunk_alloc(trans, fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+
+ btrfs_add_raid_kobjects(fs_info);
+
+ return 1;
+ }
+ }
+ return 0;
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3492,7 +3606,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
- u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3500,10 +3613,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min_t(u64, size_to_free, SZ_1M);
- if (!device->writeable ||
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
- device->is_tgtdev_for_dev_replace)
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -3651,28 +3764,21 @@ again:
goto loop;
}
- ASSERT(fs_info->data_sinfo);
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
-
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
- !chunk_reserved && !bytes_used) {
- trans = btrfs_start_transaction(chunk_root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- ret = PTR_ERR(trans);
- goto error;
- }
-
- ret = btrfs_force_chunk_alloc(trans, fs_info,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
+ if (!chunk_reserved) {
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up losing the data
+ * raid profile, so let's allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info,
+ found_key.offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ } else if (ret == 1) {
+ chunk_reserved = 1;
}
- chunk_reserved = 1;
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -3804,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
BUG_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -4114,7 +4220,8 @@ static int btrfs_uuid_scan_kthread(void *data)
key.offset = 0;
while (1) {
- ret = btrfs_search_forward(root, &key, path, 0);
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
@@ -4371,7 +4478,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
new_size = round_down(new_size, fs_info->sectorsize);
diff = round_down(old_size - new_size, fs_info->sectorsize);
- if (device->is_tgtdev_for_dev_replace)
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return -EINVAL;
path = btrfs_alloc_path();
@@ -4383,7 +4490,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
@@ -4435,6 +4542,18 @@ again:
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up losing the data
+ * raid profile, so let's allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto done;
+ }
+
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
@@ -4508,7 +4627,7 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
device->fs_devices->total_rw_bytes += diff;
atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
@@ -4572,7 +4691,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -4613,10 +4732,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
BUG_ON(!alloc_profile_is_valid(type, 0));
- if (list_empty(&fs_devices->alloc_list))
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
return -ENOSPC;
+ }
- index = __get_raid_index(type);
+ index = btrfs_bg_flags_to_raid_index(type);
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
@@ -4629,7 +4751,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4638,7 +4760,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
@@ -4668,14 +4790,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 max_avail;
u64 dev_offset;
- if (!device->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
- if (!device->in_fs_metadata ||
- device->is_tgtdev_for_dev_replace)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (device->total_bytes > device->bytes_used)
@@ -4696,8 +4819,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret == 0)
max_avail = max_stripe_size * dev_stripes;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info,
+ "%s: devid %llu has no free space, have=%llu want=%u",
+ __func__, device->devid, max_avail,
+ BTRFS_STRIPE_LEN * dev_stripes);
continue;
+ }
if (ndevs == fs_devices->rw_devices) {
WARN(1, "%s: found more than %llu devices\n",
@@ -4720,18 +4849,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
/* round down to number of usable stripes */
ndevs = round_down(ndevs, devs_increment);
- if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ if (ndevs < devs_min) {
ret = -ENOSPC;
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ndevs, devs_min);
+ }
goto error;
}
ndevs = min(ndevs, devs_max);
/*
- * the primary goal is to maximize the number of stripes, so use as many
- * devices as possible, even if the stripes are not maximum sized.
+ * The primary goal is to maximize the number of stripes, so use as
+ * many devices as possible, even if the stripes are not maximum sized.
+ *
+ * The DUP profile stores more than one stripe per device; max_avail
+ * is the total size, so we have to adjust.
*/
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
num_stripes = ndevs * dev_stripes;
/*
@@ -4752,22 +4889,19 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* and compare that answer with the max chunk size
*/
if (stripe_size * data_stripes > max_chunk_size) {
- u64 mask = (1ULL << 24) - 1;
-
stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
- stripe_size = (stripe_size + mask) & ~mask;
+ stripe_size = round_up(stripe_size, SZ_16M);
- /* but don't go higher than the limits we found
- * while searching for free extents
+ /*
+ * But don't go higher than the limits we found while searching
+ * for free extents
*/
- if (stripe_size > devices_info[ndevs-1].max_avail)
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = min(devices_info[ndevs - 1].max_avail,
+ stripe_size);
}
- stripe_size = div_u64(stripe_size, dev_stripes);
-
/* align to BTRFS_STRIPE_LEN */
stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
@@ -4813,16 +4947,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em_tree = &info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- if (!ret) {
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
- refcount_inc(&em->refs);
- }
- write_unlock(&em_tree->lock);
if (ret) {
+ write_unlock(&em_tree->lock);
free_extent_map(em);
goto error;
}
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ refcount_inc(&em->refs);
+ write_unlock(&em_tree->lock);
+
ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
if (ret)
goto error_del_extent;
@@ -4966,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
chunk_offset = find_next_chunk(fs_info);
return __btrfs_alloc_chunk(trans, chunk_offset, type);
}
@@ -5023,12 +5157,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING,
+ &map->stripes[i].dev->dev_state)) {
miss_ndevs++;
continue;
}
-
- if (!map->stripes[i].dev->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &map->stripes[i].dev->dev_state)) {
readonly = 1;
goto end;
}
@@ -5094,16 +5229,23 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- ret = 3;
+ /*
+ * There could be two corrupted data stripes, so we need
+ * to retry in a loop in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the
+ * stripe under reconstruction.
+ */
+ ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
return ret;
}
@@ -5144,13 +5286,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first, int num,
- int optimal, int dev_replace_is_ongoing)
+ struct map_lookup *map, int first,
+ int dev_replace_is_ongoing)
{
int i;
+ int num_stripes;
+ int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
+ ASSERT((map->type &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = map->sub_stripes;
+ else
+ num_stripes = map->num_stripes;
+
+ preferred_mirror = first + current->pid % num_stripes;
+
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
@@ -5164,10 +5318,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
- if (map->stripes[optimal].dev->bdev &&
- (tolerance || map->stripes[optimal].dev != srcdev))
- return optimal;
- for (i = first; i < first + num; i++) {
+ if (map->stripes[preferred_mirror].dev->bdev &&
+ (tolerance || map->stripes[preferred_mirror].dev != srcdev))
+ return preferred_mirror;
+ for (i = first; i < first + num_stripes; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
@@ -5177,7 +5331,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
- return optimal;
+ return preferred_mirror;
}
static inline int parity_smaller(u64 a, u64 b)
@@ -5669,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (!bbio_ret)
goto out;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
else
btrfs_dev_replace_set_lock_blocking(dev_replace);
@@ -5695,23 +5849,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
- map->num_stripes,
- current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (need_full_stripe(op)) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5725,7 +5877,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
num_stripes = map->sub_stripes;
else if (mirror_num)
stripe_index += mirror_num - 1;
@@ -5733,16 +5885,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
- map->sub_stripes, stripe_index +
- current->pid % map->sub_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
- mirror_num > 1)) {
+ if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
@@ -5769,9 +5917,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE &&
- op != BTRFS_MAP_GET_READ_MIRRORS) &&
- mirror_num <= 1)
+ if (!need_full_stripe(op) && mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5878,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
out:
if (dev_replace_is_ongoing) {
btrfs_dev_replace_clear_lock_blocking(dev_replace);
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
return ret;
@@ -5998,15 +6144,14 @@ static void btrfs_end_bio(struct bio *bio)
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- btrfs_dev_stat_print_on_error(dev);
}
}
}
@@ -6033,7 +6178,7 @@ static void btrfs_end_bio(struct bio *bio)
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- bio->bi_status = 0;
+ bio->bi_status = BLK_STS_OK;
}
btrfs_end_bbio(bbio, bio);
@@ -6056,26 +6201,18 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
- if (device->missing || !device->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
+ !device->bdev) {
bio_io_error(bio);
return;
}
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
- bio_get(bio);
btrfsic_submit_bio(bio);
- bio_put(bio);
return;
}
- /*
- * nr_async_bios allows us to reliably return congestion to the
- * higher layers. Otherwise, the async bio makes it appear we have
- * made progress against dirty pages when we've really just put it
- * on a queue for later
- */
- atomic_inc(&fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
@@ -6144,7 +6281,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- bio->bi_status = BLK_STS_IOERR;
+ if (atomic_read(&bbio->error) > bbio->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
btrfs_end_bbio(bbio, bio);
}
}
@@ -6206,7 +6346,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev ||
- (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
+ (bio_op(first_bio) == REQ_OP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
}
@@ -6249,13 +6390,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
- return NULL;
+ return device;
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices++;
return device;
@@ -6271,8 +6412,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
- * on error. Returned struct is not linked onto any lists and can be
- * destroyed with kfree() right away.
+ * on error. Returned struct is not linked onto any lists and must be
+ * destroyed with free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
@@ -6295,7 +6436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
ret = find_next_devid(fs_info, &tmp);
if (ret) {
- kfree(dev);
+ free_device(dev);
return ERR_PTR(ret);
}
}
@@ -6377,6 +6518,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid, bool error)
+{
+ if (error) /* @error only selects the log level (err vs. warn); the message is the same */
+ btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+ else
+ btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+}
+
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -6447,20 +6599,25 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid);
- return -EIO;
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return -ENOENT;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
- if (!map->stripes[i].dev) {
+ if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- return -EIO;
+ btrfs_err(fs_info,
+ "failed to init missing dev %llu: %ld",
+ devid, PTR_ERR(map->stripes[i].dev));
+ return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid);
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
}
- map->stripes[i].dev->in_fs_metadata = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &(map->stripes[i].dev->dev_state));
+
}
write_lock(&map_tree->map_tree.lock);
@@ -6489,7 +6646,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
- device->is_tgtdev_for_dev_replace = 0;
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -6501,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices;
int ret;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
fs_devices = fs_info->fs_devices->seed;
@@ -6577,22 +6734,32 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
- return -EIO;
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, true);
+ return -ENOENT;
}
device = add_missing_dev(fs_devices, devid, dev_uuid);
- if (!device)
- return -ENOMEM;
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
+ if (IS_ERR(device)) {
+ btrfs_err(fs_info,
+ "failed to add missing dev %llu: %ld",
+ devid, PTR_ERR(device));
+ return PTR_ERR(device);
+ }
+ btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
} else {
if (!device->bdev) {
- btrfs_report_missing_device(fs_info, devid, dev_uuid);
- if (!btrfs_test_opt(fs_info, DEGRADED))
- return -EIO;
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info,
+ devid, dev_uuid, true);
+ return -ENOENT;
+ }
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, false);
}
- if(!device->bdev && !device->missing) {
+ if (!device->bdev &&
+ !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
@@ -6600,12 +6767,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
* device->missing to one here
*/
device->fs_devices->missing_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
- ASSERT(device->missing);
+ ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state));
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
@@ -6619,15 +6787,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
}
if (device->fs_devices != fs_info->fs_devices) {
- BUG_ON(device->writeable);
+ BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
- device->in_fs_metadata = 1;
- if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
device->fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
@@ -6756,19 +6925,16 @@ out_short_read:
return -EIO;
}
-void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid)
-{
- btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
-}
-
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
+ * If the @failing_dev is specified, it's accounted as missing.
+ *
* Return true if all chunks meet the minimal RW mount requirements.
* Return false if any chunk doesn't meet the minimal RW mount requirements.
*/
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev)
{
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
@@ -6796,12 +6962,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- if (!dev || !dev->bdev || dev->missing ||
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
dev->last_flush_error)
missing++;
+ else if (failing_dev && failing_dev == dev)
+ missing++;
}
if (missing > max_tolerated) {
- btrfs_warn(fs_info,
+ if (!failing_dev)
+ btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
@@ -7072,10 +7242,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
+ if (!device->dev_stats_valid || stats_cnt == 0)
continue;
- stats_cnt = atomic_read(&device->dev_stats_ccnt);
+
+ /*
+ * There is a LOAD-LOAD control dependency between the value of
+ * dev_stats_ccnt and updating the on-disk values which requires
+ * reading the in-memory counters. Such control dependencies
+ * require explicit read memory barriers.
+ *
+ * This memory barrier pairs with smp_mb__before_atomic in
+ * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
+ * barrier implied by atomic_xchg in
+ * btrfs_dev_stats_read_and_reset
+ */
+ smp_rmb();
+
ret = update_dev_stat_item(trans, fs_info, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
@@ -7214,20 +7398,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
}
/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *transaction)
+void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
int i;
- if (list_empty(&transaction->pending_chunks))
+ if (list_empty(&trans->pending_chunks))
return;
/* In order to kick the device replace finish process */
mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry(em, &transaction->pending_chunks, list) {
+ list_for_each_entry(em, &trans->pending_chunks, list) {
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {