diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 1040 |
1 files changed, 612 insertions, 428 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b39737568c22..93f8f17cacca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> +#include <linux/list_sort.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -145,6 +146,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); +/* + * Device locking + * ============== + * + * There are several mutexes that protect manipulation of devices and low-level + * structures like chunks but not block groups, extents or files + * + * uuid_mutex (global lock) + * ------------------------ + * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from + * the SCAN_DEV ioctl registration or from mount either implicitly (the first + * device) or requested by the device= mount option + * + * the mutex can be very coarse and can cover long-running operations + * + * protects: updates to fs_devices counters like missing devices, rw devices, + * seeding, structure cloning, openning/closing devices at mount/umount time + * + * global::fs_devs - add, remove, updates to the global list + * + * does not protect: manipulation of the fs_devices::devices list! + * + * btrfs_device::name - renames (write side), read is RCU + * + * fs_devices::device_list_mutex (per-fs, with RCU) + * ------------------------------------------------ + * protects updates to fs_devices::devices, ie. adding and deleting + * + * simple list traversal with read-only actions can be done with RCU protection + * + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * + * volume_mutex + * ------------ + * coarse lock owned by a mounted filesystem; used to exclude some operations + * that cannot run in parallel and affect the higher-level properties of the + * filesystem like: device add/deleting/resize/replace, or balance + * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from + * several places (internally, ioctl) + * + * chunk_mutex + * ----------- + * protects chunks, adding or removing during allocation, trim or when a new + * device is added/removed + * + * cleaner_mutex + * ------------- + * a big lock that is held by the cleaner thread and prevents running subvolume + * cleaning together with relocation or delayed iputs + * + * + * Lock nesting + * ============ + * + * uuid_mutex + * volume_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex + */ + DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head *btrfs_get_fs_uuids(void) @@ -180,6 +246,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) return fs_devs; } +static void free_device(struct btrfs_device *device) +{ + rcu_string_free(device->name); + bio_put(device->flush_bio); + kfree(device); +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -188,8 +261,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - rcu_string_free(device->name); - kfree(device); + free_device(device); } kfree(fs_devices); } @@ -207,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_cleanup_fs_uuids(void) +void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -219,6 +291,11 @@ void btrfs_cleanup_fs_uuids(void) } } +/* + * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. + * Returned struct is not linked onto any lists and must be destroyed using + * free_device. + */ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; @@ -236,7 +313,6 @@ static struct btrfs_device *__alloc_device(void) kfree(dev); return ERR_PTR(-ENOMEM); } - bio_get(dev->flush_bio); INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); @@ -244,7 +320,6 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->io_lock); - spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); @@ -360,7 +435,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long batch_run = 0; - unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; int sync_pending = 0; @@ -375,8 +449,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = device->bdev->bd_bdi; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; loop: spin_lock(&device->io_lock); @@ -443,13 +515,6 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* @@ -517,12 +582,6 @@ loop_lock: &device->work); goto done; } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } } cond_resched(); @@ -546,84 +605,144 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } - -void btrfs_free_stale_device(struct btrfs_device *cur_dev) +/* + * Search and remove all stale (devices which are not mounted) devices. + * When both inputs are NULL, it will search and release all stale devices. + * path: Optional. When provided will it release all unmounted devices + * matching this path only. + * skip_dev: Optional. Will skip this device when searching for the stale + * devices. + */ +static void btrfs_free_stale_devices(const char *path, + struct btrfs_device *skip_dev) { - struct btrfs_fs_devices *fs_devs; - struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; + struct btrfs_device *dev, *tmp_dev; - if (!cur_dev->name) - return; - - list_for_each_entry(fs_devs, &fs_uuids, list) { - int del = 1; + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { if (fs_devs->opened) continue; - if (fs_devs->seeding) - continue; - list_for_each_entry(dev, &fs_devs->devices, dev_list) { + list_for_each_entry_safe(dev, tmp_dev, + &fs_devs->devices, dev_list) { + int not_found = 0; - if (dev == cur_dev) + if (skip_dev && skip_dev == dev) continue; - if (!dev->name) + if (path && !dev->name) continue; - /* - * Todo: This won't be enough. What if the same device - * comes back (with new uuid and) with its mapper path? - * But for now, this does help as mostly an admin will - * either use mapper or non mapper path throughout. - */ rcu_read_lock(); - del = strcmp(rcu_str_deref(dev->name), - rcu_str_deref(cur_dev->name)); + if (path) + not_found = strcmp(rcu_str_deref(dev->name), + path); rcu_read_unlock(); - if (!del) - break; - } + if (not_found) + continue; - if (!del) { /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); list_del(&fs_devs->list); free_fs_devices(fs_devs); + break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); - rcu_string_free(dev->name); - kfree(dev); + free_device(dev); } - break; } } } +static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, fmode_t flags, + void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 devid; + int ret; + + if (device->bdev) + return -EINVAL; + if (!device->name) + return -EINVAL; + + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + return ret; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + fs_devices->seeding = 1; + } else { + if (bdev_read_only(bdev)) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + else + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + } + + q = bdev_get_queue(bdev); + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; + + device->bdev = bdev; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + device->mode = flags; + + fs_devices->open_devices++; + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); + } + brelse(bh); + + return 0; + +error_brelse: + brelse(bh); + blkdev_put(bdev, flags); + + return -EINVAL; +} + /* * Add new device to list of registered devices * * Returns: - * 1 - first time device is seen - * 0 - device already known - * < 0 - error + * device pointer which was just added or updated when successful + * error pointer when failed */ -static noinline int device_list_add(const char *path, - struct btrfs_super_block *disk_super, - u64 devid, struct btrfs_fs_devices **fs_devices_ret) +static noinline struct btrfs_device *device_list_add(const char *path, + struct btrfs_super_block *disk_super) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; - int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); + u64 devid = btrfs_stack_device_id(&disk_super->dev_item); fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) - return PTR_ERR(fs_devices); + return ERR_CAST(fs_devices); list_add(&fs_devices->list, &fs_uuids); @@ -635,19 +754,19 @@ static noinline int device_list_add(const char *path, if (!device) { if (fs_devices->opened) - return -EBUSY; + return ERR_PTR(-EBUSY); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return PTR_ERR(device); + return device; } name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - kfree(device); - return -ENOMEM; + free_device(device); + return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); @@ -656,8 +775,16 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); - ret = 1; device->fs_devices = fs_devices; + btrfs_free_stale_devices(path, device); + + if (disk_super->label[0]) + pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", + disk_super->label, devid, found_transid, path); + else + pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", + disk_super->fsid, devid, found_transid, path); + } else if (!device->name || strcmp(device->name->str, path)) { /* * When FS is already mounted. @@ -693,17 +820,17 @@ static noinline int device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. */ - return -EEXIST; + return ERR_PTR(-EEXIST); } name = rcu_string_strdup(path, GFP_NOFS); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rcu_string_free(device->name); rcu_assign_pointer(device->name, name); - if (device->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; - device->missing = 0; + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } } @@ -716,16 +843,9 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; - /* - * if there is new btrfs on an already registered device, - * then remove the stale device entry. - */ - if (ret > 0) - btrfs_free_stale_device(device); - - *fs_devices_ret = fs_devices; + fs_devices->total_devices = btrfs_super_num_devices(disk_super); - return ret; + return device; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -758,7 +878,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - kfree(device); + free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -776,7 +896,11 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) +/* + * After we have read the system tree and know devids belonging to + * this filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -785,10 +909,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) { - if (!device->is_tgtdev_for_dev_replace && - (!latest_dev || - device->generation > latest_dev->generation)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state)) { + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state) && + (!latest_dev || + device->generation > latest_dev->generation)) { latest_dev = device; } continue; @@ -805,7 +931,8 @@ again: * not, which means whether this device is * used or whether it should be removed. */ - if (step == 0 || device->is_tgtdev_for_dev_replace) { + if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) { continue; } } @@ -814,16 +941,16 @@ again: device->bdev = NULL; fs_devices->open_devices--; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); - device->writeable = 0; - if (!device->is_tgtdev_for_dev_replace) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; - rcu_string_free(device->name); - kfree(device); + free_device(device); } if (fs_devices->seed) { @@ -836,35 +963,25 @@ again: mutex_unlock(&uuid_mutex); } -static void __free_device(struct work_struct *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, rcu_work); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); -} - -static void free_device(struct rcu_head *head) +static void free_device_rcu(struct rcu_head *head) { struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - - INIT_WORK(&device->rcu_work, __free_device); - schedule_work(&device->rcu_work); + free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) { - if (device->bdev && device->writeable) { + if (!device->bdev) + return; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } - if (device->bdev) - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); } static void btrfs_prepare_close_one_device(struct btrfs_device *device) @@ -876,13 +993,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) if (device->bdev) fs_devices->open_devices--; - if (device->writeable && + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; new_device = btrfs_alloc_device(NULL, &device->devid, @@ -928,7 +1045,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); } WARN_ON(fs_devices->open_devices); @@ -958,93 +1075,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } - /* - * Wait for rcu kworkers under __btrfs_close_devices - * to finish all blkdev_puts so device is really - * free when umount is done. - */ - rcu_barrier(); return ret; } static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct request_queue *q; - struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - u64 devid; - int seeding = 1; int ret = 0; flags |= FMODE_EXCL; list_for_each_entry(device, head, dev_list) { - if (device->bdev) - continue; - if (!device->name) - continue; - /* Just open everything we can; ignore failures here */ - if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh)) + if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - if (devid != device->devid) - goto error_brelse; - - if (memcmp(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE)) - goto error_brelse; - - device->generation = btrfs_super_generation(disk_super); if (!latest_dev || device->generation > latest_dev->generation) latest_dev = device; - - if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { - device->writeable = 0; - } else { - device->writeable = !bdev_read_only(bdev); - seeding = 0; - } - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; - - device->bdev = bdev; - device->in_fs_metadata = 0; - device->mode = flags; - - fs_devices->open_devices++; - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, - &fs_devices->alloc_list); - } - brelse(bh); - continue; - -error_brelse: - brelse(bh); - blkdev_put(bdev, flags); - continue; } if (fs_devices->open_devices == 0) { ret = -EINVAL; goto out; } - fs_devices->seeding = seeding; fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; @@ -1052,6 +1108,20 @@ out: return ret; } +static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_device *dev1, *dev2; + + dev1 = list_entry(a, struct btrfs_device, dev_list); + dev2 = list_entry(b, struct btrfs_device, dev_list); + + if (dev1->devid < dev2->devid) + return -1; + else if (dev1->devid > dev2->devid) + return 1; + return 0; +} + int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { @@ -1062,20 +1132,22 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->opened++; ret = 0; } else { + list_sort(NULL, &fs_devices->devices, devid_cmp); ret = __btrfs_open_devices(fs_devices, flags, holder); } mutex_unlock(&uuid_mutex); return ret; } -void btrfs_release_disk_super(struct page *page) +static void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } -int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, struct btrfs_super_block **disk_super) +static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, + struct btrfs_super_block **disk_super) { void *p; pgoff_t index; @@ -1127,12 +1199,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; + struct btrfs_device *device; struct block_device *bdev; struct page *page; - int ret = -EINVAL; - u64 devid; - u64 transid; - u64 total_devices; + int ret = 0; u64 bytenr; /* @@ -1151,26 +1221,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { + ret = -EINVAL; goto error_bdev_put; - - devid = btrfs_stack_device_id(&disk_super->dev_item); - transid = btrfs_super_generation(disk_super); - total_devices = btrfs_super_num_devices(disk_super); - - ret = device_list_add(path, disk_super, devid, fs_devices_ret); - if (ret > 0) { - if (disk_super->label[0]) { - pr_info("BTRFS: device label %s ", disk_super->label); - } else { - pr_info("BTRFS: device fsid %pU ", disk_super->fsid); - } - - pr_cont("devid %llu transid %llu %s\n", devid, transid, path); - ret = 0; } - if (!ret && fs_devices_ret) - (*fs_devices_ret)->total_devices = total_devices; + + device = device_list_add(path, disk_super); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + *fs_devices_ret = device->fs_devices; btrfs_release_disk_super(page); @@ -1196,7 +1256,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) + if (start >= device->total_bytes || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return 0; path = btrfs_alloc_path(); @@ -1374,7 +1435,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, max_hole_size = 0; again: - if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { + if (search_start >= search_end || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; goto out; } @@ -1581,8 +1643,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - WARN_ON(!device->in_fs_metadata); - WARN_ON(device->is_tgtdev_for_dev_replace); + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1672,7 +1734,7 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -static int btrfs_add_device(struct btrfs_trans_handle *trans, +static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_device *device) { @@ -1765,20 +1827,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } @@ -1817,14 +1883,15 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } -struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, - struct btrfs_device *device) +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; list_for_each_entry(next_device, &fs_devs->devices, dev_list) { if (next_device != device && - !next_device->missing && next_device->bdev) + !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) + && next_device->bdev) return next_device; } @@ -1865,15 +1932,16 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) @@ -1884,17 +1952,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto out; - if (device->is_tgtdev_for_dev_replace) { + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; } - if (device->writeable && fs_info->fs_devices->rw_devices == 1) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto out; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; @@ -1916,7 +1985,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto error_undo; - device->in_fs_metadata = 0; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(fs_info, device); /* @@ -1936,7 +2005,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, device->fs_devices->num_devices--; device->fs_devices->total_devices--; - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) device->fs_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); @@ -1956,11 +2025,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * the devices list. All that's left is to zero out the old * supers and free the device. */ - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1979,10 +2048,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, out: mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, &fs_info->fs_devices->alloc_list); @@ -1997,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices; - WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + lockdep_assert_held(&fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point @@ -2008,12 +2078,12 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; - if (srcdev->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) fs_devices->missing_devices--; - if (srcdev->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) fs_devices->rw_devices--; if (srcdev->bdev) @@ -2025,25 +2095,26 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (srcdev->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } btrfs_close_bdev(srcdev); - - call_rcu(&srcdev->rcu, free_device); - - /* - * unless fs_devices is seed fs, num_devices shouldn't go - * zero - */ - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); + call_rcu(&srcdev->rcu, free_device_rcu); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { struct btrfs_fs_devices *tmp_fs_devices; + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { if (tmp_fs_devices->seed == fs_devices) { @@ -2089,7 +2160,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device); + call_rcu(&tgtdev->rcu, free_device_rcu); } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, @@ -2134,7 +2205,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, * is held by the caller. */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &tmp->dev_state) && !tmp->bdev) { *device = tmp; break; } @@ -2185,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 super_flags; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) return -EINVAL; @@ -2323,6 +2395,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool unlocked = false; if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2362,24 +2435,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - kfree(device); ret = -ENOMEM; - goto error; + goto error_free_device; } rcu_assign_pointer(device->name, name); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - rcu_string_free(device->name); - kfree(device); ret = PTR_ERR(trans); - goto error; + goto error_free_device; } q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2390,16 +2458,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 0; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_trans; + } } device->fs_devices = fs_info->fs_devices; @@ -2445,14 +2516,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } } - ret = btrfs_add_device(trans, fs_info, device); + ret = btrfs_add_dev_item(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } if (seeding_dev) { @@ -2461,7 +2532,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } /* Sprouting would change fsid of the mounted root, @@ -2479,6 +2550,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + unlocked = true; if (ret) /* transaction commit */ return ret; @@ -2491,7 +2563,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; - return PTR_ERR(trans); + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; } ret = btrfs_commit_transaction(trans); } @@ -2500,14 +2574,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; -error_trans: - btrfs_end_transaction(trans); - rcu_string_free(device->name); +error_sysfs: btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); - kfree(device); +error_trans: + if (seeding_dev) + sb->s_flags |= SB_RDONLY; + if (trans) + btrfs_end_transaction(trans); +error_free_device: + free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { + if (seeding_dev && !unlocked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -2519,7 +2597,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { - struct request_queue *q; struct btrfs_device *device; struct block_device *bdev; struct list_head *devices; @@ -2570,17 +2647,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - kfree(device); + free_device(device); ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; mutex_lock(&fs_info->fs_devices->device_list_mutex); - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2588,13 +2662,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->total_bytes = btrfs_device_get_total_bytes(srcdev); device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); device->bytes_used = btrfs_device_get_bytes_used(srcdev); - ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2612,19 +2685,6 @@ error: return ret; } -void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev) -{ - u32 sectorsize = fs_info->sectorsize; - - WARN_ON(fs_info->fs_devices->rw_devices == 0); - tgtdev->io_width = sectorsize; - tgtdev->io_align = sectorsize; - tgtdev->sector_size = sectorsize; - tgtdev->fs_info = fs_info; - tgtdev->in_fs_metadata = 1; -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -2680,7 +2740,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, u64 old_total; u64 diff; - if (!device->writeable) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; new_size = round_down(new_size, fs_info->sectorsize); @@ -2690,7 +2750,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) { + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } @@ -2930,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ - ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); + lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); ret = btrfs_can_relocate(fs_info, chunk_offset); if (ret) @@ -2943,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + /* + * We add the kobjects here (and after forcing data chunk creation) + * since relocation is the only place we'll create chunks of a new + * type at runtime. The only place where we'll remove the last + * chunk of a type is the call immediately below this one. Even + * so, we're protected against races with the cleaner thread since + * we're covered by the delete_unused_bgs_mutex. + */ + btrfs_add_raid_kobjects(fs_info); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3034,6 +3104,50 @@ error: return ret; } +/* + * return 1 : allocate a data chunk successfully, + * return <0: errors during allocating a data chunk, + * return 0 : no need to allocate a data chunk. + */ +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, + u64 chunk_offset) +{ + struct btrfs_block_group_cache *cache; + u64 bytes_used; + u64 chunk_type; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + ASSERT(cache); + chunk_type = cache->flags; + btrfs_put_block_group(cache); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_force_chunk_alloc(trans, fs_info, + BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + + btrfs_add_raid_kobjects(fs_info); + + return 1; + } + } + return 0; +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3492,7 +3606,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; - u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3500,10 +3613,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!device->writeable || + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || - device->is_tgtdev_for_dev_replace) + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3651,28 +3764,21 @@ again: goto loop; } - ASSERT(fs_info->data_sinfo); - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); - - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && - !chunk_reserved && !bytes_used) { - trans = btrfs_start_transaction(chunk_root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - ret = PTR_ERR(trans); - goto error; - } - - ret = btrfs_force_chunk_alloc(trans, fs_info, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); + if (!chunk_reserved) { + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, + found_key.offset); if (ret < 0) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } else if (ret == 1) { + chunk_reserved = 1; } - chunk_reserved = 1; } ret = btrfs_relocate_chunk(fs_info, found_key.offset); @@ -3804,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { BUG_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); @@ -4114,7 +4220,8 @@ static int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { - ret = btrfs_search_forward(root, &key, path, 0); + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; @@ -4371,7 +4478,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) new_size = round_down(new_size, fs_info->sectorsize); diff = round_down(old_size - new_size, fs_info->sectorsize); - if (device->is_tgtdev_for_dev_replace) + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return -EINVAL; path = btrfs_alloc_path(); @@ -4383,7 +4490,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } @@ -4435,6 +4542,18 @@ again: chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + goto done; + } + ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) @@ -4508,7 +4627,7 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) device->fs_devices->total_rw_bytes += diff; atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); @@ -4572,7 +4691,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -4613,10 +4732,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(!alloc_profile_is_valid(type, 0)); - if (list_empty(&fs_devices->alloc_list)) + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); return -ENOSPC; + } - index = __get_raid_index(type); + index = btrfs_bg_flags_to_raid_index(type); sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; @@ -4629,7 +4751,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4638,7 +4760,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; @@ -4668,14 +4790,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_avail; u64 dev_offset; - if (!device->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); continue; } - if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (device->total_bytes > device->bytes_used) @@ -4696,8 +4819,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret == 0) max_avail = max_stripe_size * dev_stripes; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, + "%s: devid %llu has no free space, have=%llu want=%u", + __func__, device->devid, max_avail, + BTRFS_STRIPE_LEN * dev_stripes); continue; + } if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n", @@ -4720,18 +4849,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* round down to number of usable stripes */ ndevs = round_down(ndevs, devs_increment); - if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + if (ndevs < devs_min) { ret = -ENOSPC; + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ndevs, devs_min); + } goto error; } ndevs = min(ndevs, devs_max); /* - * the primary goal is to maximize the number of stripes, so use as many - * devices as possible, even if the stripes are not maximum sized. + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. */ - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); num_stripes = ndevs * dev_stripes; /* @@ -4752,22 +4889,19 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * and compare that answer with the max chunk size */ if (stripe_size * data_stripes > max_chunk_size) { - u64 mask = (1ULL << 24) - 1; - stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ - stripe_size = (stripe_size + mask) & ~mask; + stripe_size = round_up(stripe_size, SZ_16M); - /* but don't go higher than the limits we found - * while searching for free extents + /* + * But don't go higher than the limits we found while searching + * for free extents */ - if (stripe_size > devices_info[ndevs-1].max_avail) - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size); } - stripe_size = div_u64(stripe_size, dev_stripes); - /* align to BTRFS_STRIPE_LEN */ stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); @@ -4813,16 +4947,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - if (!ret) { - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); - } - write_unlock(&em_tree->lock); if (ret) { + write_unlock(&em_tree->lock); free_extent_map(em); goto error; } + list_add_tail(&em->list, &trans->transaction->pending_chunks); + refcount_inc(&em->refs); + write_unlock(&em_tree->lock); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -4966,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); chunk_offset = find_next_chunk(fs_info); return __btrfs_alloc_chunk(trans, chunk_offset, type); } @@ -5023,12 +5157,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, + &map->stripes[i].dev->dev_state)) { miss_ndevs++; continue; } - - if (!map->stripes[i].dev->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &map->stripes[i].dev->dev_state)) { readonly = 1; goto end; } @@ -5094,16 +5229,23 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - ret = 3; + /* + * There could be two corrupted data stripes, we need + * to loop retry in order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the + * stripe under reconstruction. + */ + ret = map->num_stripes; else ret = 1; free_extent_map(em); - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); return ret; } @@ -5144,13 +5286,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, int num, - int optimal, int dev_replace_is_ongoing) + struct map_lookup *map, int first, + int dev_replace_is_ongoing) { int i; + int num_stripes; + int preferred_mirror; int tolerance; struct btrfs_device *srcdev; + ASSERT((map->type & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = map->sub_stripes; + else + num_stripes = map->num_stripes; + + preferred_mirror = first + current->pid % num_stripes; + if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) @@ -5164,10 +5318,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, * mirror is available */ for (tolerance = 0; tolerance < 2; tolerance++) { - if (map->stripes[optimal].dev->bdev && - (tolerance || map->stripes[optimal].dev != srcdev)) - return optimal; - for (i = first; i < first + num; i++) { + if (map->stripes[preferred_mirror].dev->bdev && + (tolerance || map->stripes[preferred_mirror].dev != srcdev)) + return preferred_mirror; + for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) return i; @@ -5177,7 +5331,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ - return optimal; + return preferred_mirror; } static inline int parity_smaller(u64 a, u64 b) @@ -5669,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (!bbio_ret) goto out; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (!dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); else btrfs_dev_replace_set_lock_blocking(dev_replace); @@ -5695,23 +5849,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) + if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; else { stripe_index = find_live_mirror(fs_info, map, 0, - map->num_stripes, - current->pid % map->num_stripes, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { + if (need_full_stripe(op)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5725,7 +5877,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -5733,16 +5885,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int old_stripe_index = stripe_index; stripe_index = find_live_mirror(fs_info, map, stripe_index, - map->sub_stripes, stripe_index + - current->pid % map->sub_stripes, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || - mirror_num > 1)) { + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); @@ -5769,9 +5917,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && - op != BTRFS_MAP_GET_READ_MIRRORS) && - mirror_num <= 1) + if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5878,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, out: if (dev_replace_is_ongoing) { btrfs_dev_replace_clear_lock_blocking(dev_replace); - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); } free_extent_map(em); return ret; @@ -5998,15 +6144,14 @@ static void btrfs_end_bio(struct bio *bio) dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - btrfs_dev_stat_print_on_error(dev); } } } @@ -6033,7 +6178,7 @@ static void btrfs_end_bio(struct bio *bio) * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_status = 0; + bio->bi_status = BLK_STS_OK; } btrfs_end_bbio(bbio, bio); @@ -6056,26 +6201,18 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (device->missing || !device->bdev) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || + !device->bdev) { bio_io_error(bio); return; } /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { - bio_get(bio); btrfsic_submit_bio(bio); - bio_put(bio); return; } - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; @@ -6144,7 +6281,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_status = BLK_STS_IOERR; + if (atomic_read(&bbio->error) > bbio->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; btrfs_end_bbio(bbio, bio); } } @@ -6206,7 +6346,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; } @@ -6249,13 +6390,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) - return NULL; + return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices++; return device; @@ -6271,8 +6412,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() - * on error. Returned struct is not linked onto any lists and can be - * destroyed with kfree() right away. + * on error. Returned struct is not linked onto any lists and must be + * destroyed with free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6295,7 +6436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - kfree(dev); + free_device(dev); return ERR_PTR(ret); } } @@ -6377,6 +6518,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) +{ + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); +} + static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6447,20 +6599,25 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, uuid, true); + return -ENOENT; } if (!map->stripes[i].dev) { map->stripes[i].dev = add_missing_dev(fs_info->fs_devices, devid, uuid); - if (!map->stripes[i].dev) { + if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - return -EIO; + btrfs_err(fs_info, + "failed to init missing dev %llu: %ld", + devid, PTR_ERR(map->stripes[i].dev)); + return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, false); } - map->stripes[i].dev->in_fs_metadata = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &(map->stripes[i].dev->dev_state)); + } write_lock(&map_tree->map_tree.lock); @@ -6489,7 +6646,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); - device->is_tgtdev_for_dev_replace = 0; + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -6501,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices; int ret; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); ASSERT(fsid); fs_devices = fs_info->fs_devices->seed; @@ -6577,22 +6734,32 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); + return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); - if (!device) - return -ENOMEM; - btrfs_report_missing_device(fs_info, devid, dev_uuid); + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - if (!btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); + return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); } - if(!device->bdev && !device->missing) { + if (!device->bdev && + !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. @@ -6600,12 +6767,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, * device->missing to one here */ device->fs_devices->missing_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } /* Move the device to its own fs_devices */ if (device->fs_devices != fs_devices) { - ASSERT(device->missing); + ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state)); list_move(&device->dev_list, &fs_devices->devices); device->fs_devices->num_devices--; @@ -6619,15 +6787,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } if (device->fs_devices != fs_info->fs_devices) { - BUG_ON(device->writeable); + BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation != btrfs_device_generation(leaf, dev_item)) return -EINVAL; } fill_device_from_item(leaf, dev_item, device); - device->in_fs_metadata = 1; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { device->fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); @@ -6756,19 +6925,16 @@ out_short_read: return -EIO; } -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid) -{ - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); -} - /* * Check if all chunks in the fs are OK for read-write degraded mount * + * If the @failing_dev is specified, it's accounted as missing. + * * Return true if all chunks meet the minimal RW mount requirements. * Return false if any chunk doesn't meet the minimal RW mount requirements. */ -bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev) { struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; @@ -6796,12 +6962,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - if (!dev || !dev->bdev || dev->missing || + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error) missing++; + else if (failing_dev && failing_dev == dev) + missing++; } if (missing > max_tolerated) { - btrfs_warn(fs_info, + if (!failing_dev) + btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writeable mount", em->start, missing, max_tolerated); free_extent_map(em); @@ -7072,10 +7242,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) + stats_cnt = atomic_read(&device->dev_stats_ccnt); + if (!device->dev_stats_valid || stats_cnt == 0) continue; - stats_cnt = atomic_read(&device->dev_stats_ccnt); + + /* + * There is a LOAD-LOAD control dependency between the value of + * dev_stats_ccnt and updating the on-disk values which requires + * reading the in-memory counters. Such control dependencies + * require explicit read memory barriers. + * + * This memory barriers pairs with smp_mb__before_atomic in + * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full + * barrier implied by atomic_xchg in + * btrfs_dev_stats_read_and_reset + */ + smp_rmb(); + ret = update_dev_stat_item(trans, fs_info, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); @@ -7214,20 +7398,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) } /* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *transaction) +void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; int i; - if (list_empty(&transaction->pending_chunks)) + if (list_empty(&trans->pending_chunks)) return; /* In order to kick the device replace finish process */ mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &transaction->pending_chunks, list) { + list_for_each_entry(em, &trans->pending_chunks, list) { map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { |