From 4330e183c9537df20952d4a9ee142c536fb8ae54 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Mar 2017 09:34:37 +0800 Subject: btrfs: Do chunk level check for degraded rw mount Now use the btrfs_check_rw_degradable() to check if we can mount in the degraded mode. With this patch, we can mount in the following case: # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc # wipefs -a /dev/sdc # mount /dev/sdb /mnt/btrfs -o degraded As the single data chunk is only on sdb, so it's OK to mount as degraded, as missing one device is OK for RAID1. But still fail in the following case as expected: # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc # wipefs -a /dev/sdb # mount /dev/sdc /mnt/btrfs -o degraded As the data chunk is only in sdb, so it's not OK to mount it as degraded. Reported-by: Zhao Lei Reported-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..78fc7f5ab773 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3035,15 +3035,10 @@ retry_root_backup: btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(sb->s_flags & MS_RDONLY)) { + + if (!(sb->s_flags & MS_RDONLY) && !btrfs_check_rw_degradable(fs_info)) { btrfs_warn(fs_info, -"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", - fs_info->fs_devices->missing_devices, - fs_info->num_tolerated_disk_barrier_failures); + "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; } -- cgit From d10b82fe29cf3730ab7c1a487f0f62f10af5d301 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 27 Jun 2017 17:28:40 +0800 Subject: btrfs: Allow barrier_all_devices to do chunk level device check The last user of num_tolerated_disk_barrier_failures is barrier_all_devices(). But it can be easily changed to the new per-chunk degradable check framework. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 78fc7f5ab773..552098e1b65b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3519,20 +3519,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) return bio->bi_status; } -static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +static int check_barrier_error(struct btrfs_fs_info *fs_info) { - int dev_flush_error = 0; - struct btrfs_device *dev; - - list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { - if (!dev->bdev || dev->last_flush_error) - dev_flush_error++; - } - - if (dev_flush_error > - fsdevs->fs_info->num_tolerated_disk_barrier_failures) + if (!btrfs_check_rw_degradable(fs_info)) return -EIO; - return 0; } @@ -3587,7 +3577,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * to arrive at the volume status. So error checking * is being pushed to a separate loop. */ - return check_barrier_error(info->fs_devices); + return check_barrier_error(info); } return 0; } -- cgit From bc3cce2378b9eeb420873b17664493e68fd07e8c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Mar 2017 09:34:41 +0800 Subject: btrfs: Cleanup num_tolerated_disk_barrier_failures As we use per-chunk degradable check, the global num_tolerated_disk_barrier_failures is of no use. We can now remove it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 552098e1b65b..2f366044d891 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3611,60 +3611,6 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) return min_tolerated; } -int btrfs_calc_num_tolerated_disk_barrier_failures( - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ioctl_space_info space; - struct btrfs_space_info *sinfo; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; - int i; - int c; - int num_tolerated_disk_barrier_failures = - (int)fs_info->fs_devices->num_devices; - - for (i = 0; i < ARRAY_SIZE(types); i++) { - struct btrfs_space_info *tmp; - - sinfo = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { - if (tmp->flags == types[i]) { - sinfo = tmp; - break; - } - } - rcu_read_unlock(); - - if (!sinfo) - continue; - - down_read(&sinfo->groups_sem); - for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - u64 flags; - - if (list_empty(&sinfo->block_groups[c])) - continue; - - btrfs_get_block_group_info(&sinfo->block_groups[c], - &space); - if (space.total_bytes == 0 || space.used_bytes == 0) - continue; - flags = space.flags; - - num_tolerated_disk_barrier_failures = min( - num_tolerated_disk_barrier_failures, - btrfs_get_num_tolerated_disk_barrier_failures( - flags)); - } - up_read(&sinfo->groups_sem); - } - - return num_tolerated_disk_barrier_failures; -} - int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) { struct list_head *head; -- cgit From a4f78750ef1882e59bb4f947e216cf61ef2d67d2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 29 Jun 2017 18:37:49 +0200 Subject: btrfs: get fs_info from eb in btrfs_print_leaf, remove argument Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f366044d891..a42ae7676759 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3984,7 +3984,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { - btrfs_print_leaf(fs_info, buf); + btrfs_print_leaf(buf); ASSERT(0); } #endif -- cgit From abbb3b8ebfec577390873d33076d56bc6e758939 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Jun 2017 00:50:33 +0200 Subject: btrfs: split write_dev_supers to two functions There are two independent parts, one that writes the superblocks and another that waits for completion. No functional changes, but cleanups, reformatting and comment updates. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 129 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 56 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a42ae7676759..b72711ad7121 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3373,19 +3373,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) } /* - * this should be called twice, once with wait == 0 and - * once with wait == 1. When wait == 0 is done, all the buffer heads - * we write are pinned. + * Write superblock @sb to the @device. Do not wait for completion, all the + * buffer heads we write are pinned. * - * They are released when wait == 1 is done. - * max_mirrors must be the same for both runs, and it indicates how - * many supers on this one device should be written. + * Write @max_mirrors copies of the superblock, where 0 means default that fit + * the expected device size at commit time. Note that max_mirrors must be + * same for write and wait phases. * - * max_mirrors == 0 means to write them all. + * Return number of errors when buffer head is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, - struct btrfs_super_block *sb, - int wait, int max_mirrors) + struct btrfs_super_block *sb, int max_mirrors) { struct buffer_head *bh; int i; @@ -3403,57 +3401,33 @@ static int write_dev_supers(struct btrfs_device *device, device->commit_total_bytes) break; - if (wait) { - bh = __find_get_block(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { - errors++; - continue; - } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; + btrfs_set_super_bytenr(sb, bytenr); - /* drop our reference */ - brelse(bh); + crc = ~(u32)0; + crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); - /* drop the reference from the wait == 0 run */ - brelse(bh); + /* One reference for us, and we leave it for the caller */ + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + btrfs_err(device->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); + errors++; continue; - } else { - btrfs_set_super_bytenr(sb, bytenr); - - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + - BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); - - /* - * one reference for us, and we leave it for the - * caller - */ - bh = __getblk(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { - btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", - bytenr); - errors++; - continue; - } + } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - /* one reference for submit_bh */ - get_bh(bh); + /* one reference for submit_bh */ + get_bh(bh); - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; - } + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; /* * we fua the first super. The others we allow @@ -3471,6 +3445,49 @@ static int write_dev_supers(struct btrfs_device *device, return errors < i ? 0 : -1; } +/* + * Wait for write completion of superblocks done by write_dev_supers, + * @max_mirrors same for write and wait phases. + * + * Return number of errors when buffer head is not found or not marked up to + * date. + */ +static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int errors = 0; + u64 bytenr; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) + break; + + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + errors++; + continue; + } + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the writing run */ + brelse(bh); + } + + return errors < i ? 0 : -1; +} + /* * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done @@ -3668,7 +3685,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); - ret = write_dev_supers(dev, sb, 0, max_mirrors); + ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; } @@ -3691,7 +3708,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_supers(dev, sb, 1, max_mirrors); + ret = wait_dev_supers(dev, max_mirrors); if (ret) total_errors++; } -- cgit From 9f6d251033bc5bd0115c23fae8b6aac5f6263c2c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Jun 2017 01:48:05 +0200 Subject: btrfs: use named constant for bdev blocksize Superblock is read and written using buffer heads, we need to set the bdev blocksize. The magic constant has been hardcoded in several places, so replace it with a named constant. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b72711ad7121..9b1f4ef54438 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2694,8 +2694,8 @@ int open_ctree(struct super_block *sb, btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = 4096; - sb->s_blocksize_bits = blksize_bits(4096); + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); btrfs_init_btree_inode(fs_info); @@ -3316,7 +3316,7 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return -EINVAL; - bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); /* * If we fail to read from the underlying devices, as of now * the best option we have is to mark it EIO. @@ -3409,7 +3409,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_csum_final(crc, sb->csum); /* One reference for us, and we leave it for the caller */ - bh = __getblk(device->bdev, bytenr / 4096, + bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); if (!bh) { btrfs_err(device->fs_info, @@ -3468,7 +3468,8 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - bh = __find_get_block(device->bdev, bytenr / 4096, + bh = __find_get_block(device->bdev, + bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; -- cgit From ea14b57fd1954fa3193e025224bbbeab7415c490 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 22 Jun 2017 02:19:11 +0200 Subject: btrfs: fix spelling of snapshotting Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b1f4ef54438..57a857142cda 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1343,7 +1343,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->will_be_snapshotted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; -- cgit From 4958aa6821c7c4e9682806bc4fea92250d020829 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 22 Jun 2017 03:59:40 +0200 Subject: btrfs: drop chunk locks at the end of close_ctree The pinned chunks might be left over so we clean them but at this point of close_ctree, there's noone to race with, the locking can be removed. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 57a857142cda..c2d3699a70bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3944,7 +3944,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; - mutex_lock(&fs_info->chunk_mutex); while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; @@ -3953,7 +3952,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) list_del_init(&em->list); free_extent_map(em); } - mutex_unlock(&fs_info->chunk_mutex); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, -- cgit From 44880fdc91bc4f6730e37f2cb6025b35c70b312d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 29 Jul 2017 17:50:09 +0800 Subject: btrfs: use appropriate define for the fsid Though BTRFS_FSID_SIZE and BTRFS_UUID_SIZE are of the same size, we should use the matching constant for the fsid buffer. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c2d3699a70bd..3715478fffed 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -529,7 +529,7 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - u8 fsid[BTRFS_UUID_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); @@ -3681,7 +3681,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -4120,7 +4120,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "dev_item UUID does not match fsid: %pU != %pU", fs_info->fsid, sb->dev_item.fsid); -- cgit From 43a0111103af2d358f6d2d671d8429ba169e53bb Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Fri, 18 Aug 2017 16:38:07 +0800 Subject: btrfs: use btrfsic_submit_bio instead of submit_bio in write_dev_flush Although this bio has no data attached, it will reach this condition (bio->bi_opf & REQ_PREFLUSH) and then update the flush_gen of dev_state in __btrfsic_submit_bio. So we should still submit it through integrity checker. Otherwise, the integrity checker will throw the following warning when I mount a newly created btrfs filesystem. [10264.755497] btrfs: attempt to write superblock which references block M @29523968 (sdb1/1111654400/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)! [10264.755498] btrfs: attempt to write superblock which references block M @29523968 (sdb1/37912576/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)! Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3715478fffed..195634098f21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3517,7 +3517,7 @@ static void write_dev_flush(struct btrfs_device *device) init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - submit_bio(bio); + btrfsic_submit_bio(bio); device->flush_bio_sent = 1; } -- cgit From 583b723151794e2ff1691f1510b4e43710293875 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Fri, 28 Jul 2017 08:31:28 +0200 Subject: btrfs: Do not use data_alloc_cluster in ssd mode This patch provides a band aid to improve the 'out of the box' behaviour of btrfs for disks that are detected as being an ssd. In a general purpose mixed workload scenario, the current ssd mode causes overallocation of available raw disk space for data, while leaving behind increasing amounts of unused fragmented free space. This situation leads to early ENOSPC problems which are harming user experience and adoption of btrfs as a general purpose filesystem. This patch modifies the data extent allocation behaviour of the ssd mode to make it behave identical to nossd mode. The metadata behaviour and additional ssd_spread option stay untouched so far. Recommendations for future development are to reconsider the current oversimplified nossd / ssd distinction and the broken detection mechanism based on the rotational attribute in sysfs and provide experienced users with a more flexible way to choose allocator behaviour for data and metadata, optimized for certain use cases, while keeping sane 'out of the box' default settings. The internals of the current btrfs code have more potential than what currently gets exposed to the user to choose from. The SSD story... In the first year of btrfs development, around early 2008, btrfs gained a mount option which enables specific functionality for filesystems on solid state devices. The first occurance of this functionality is in commit e18e4809, labeled "Add mount -o ssd, which includes optimizations for seek free storage". The effect on allocating free space for doing (data) writes is to 'cluster' writes together, writing them out in contiguous space, as opposed to a 'tetris' way of putting all separate writes into any free space fragment that fits (which is what the -o nossd behaviour does). A somewhat simplified explanation of what happens is that, when for example, the 'cluster' size is set to 2MiB, when we do some writes, the data allocator will search for a free space block that is 2MiB big, and put the writes in there. The ssd mode itself might allow a 2MiB cluster to be composed of multiple free space extents with some existing data in between, while the additional ssd_spread mount option kills off this option and requires fully free space. The idea behind this is (commit 536ac8ae): "The [...] clusters make it more likely a given IO will completely overwrite the ssd block, so it doesn't have to do an internal rwm cycle."; ssd block meaning nand erase block. So, effectively this means applying a "locality based algorithm" and trying to outsmart the actual ssd. Since then, various changes have been made to the involved code, but the basic idea is still present, and gets activated whenever the ssd mount option is active. This also happens by default, when the rotational flag as seen at /sys/block//queue/rotational is set to 0. However, there's a number of problems with this approach. First, what the optimization is trying to do is outsmart the ssd by assuming there is a relation between the physical address space of the block device as seen by btrfs and the actual physical storage of the ssd, and then adjusting data placement. However, since the introduction of the Flash Translation Layer (FTL) which is a part of the internal controller of an ssd, these attempts are futile. The use of good quality FTL in consumer ssd products might have been limited in 2008, but this situation has changed drastically soon after that time. Today, even the flash memory in your automatic cat feeding machine or your grandma's wheelchair has a full featured one. Second, the behaviour as described above results in the filesystem being filled up with badly fragmented free space extents because of relatively small pieces of space that are freed up by deletes, but not selected again as part of a 'cluster'. Since the algorithm prefers allocating a new chunk over going back to tetris mode, the end result is a filesystem in which all raw space is allocated, but which is composed of underutilized chunks with a 'shotgun blast' pattern of fragmented free space. Usually, the next problematic thing that happens is the filesystem wanting to allocate new space for metadata, which causes the filesystem to fail in spectacular ways. Third, the default mount options you get for an ssd ('ssd' mode enabled, 'discard' not enabled), in combination with spreading out writes over the full address space and ignoring freed up space leads to worst case behaviour in providing information to the ssd itself, since it will never learn that all the free space left behind is actually free. There are two ways to let an ssd know previously written data does not have to be preserved, which are sending explicit signals using discard or fstrim, or by simply overwriting the space with new data. The worst case behaviour is the btrfs ssd_spread mount option in combination with not having discard enabled. It has a side effect of minimizing the reuse of free space previously written in. Fourth, the rotational flag in /sys/ does not reliably indicate if the device is a locally attached ssd. For example, iSCSI or NBD displays as non-rotational, while a loop device on an ssd shows up as rotational. The combination of the second and third problem effectively means that despite all the good intentions, the btrfs ssd mode reliably causes the ssd hardware and the filesystem structures and performance to be choked to death. The clickbait version of the title of this story would have been "Btrfs ssd optimizations considered harmful for ssds". The current nossd 'tetris' mode (even still without discard) allows a pattern of overwriting much more previously used space, causing many more implicit discards to happen because of the overwrite information the ssd gets. The actual location in the physical address space, as seen from the point of view of btrfs is irrelevant, because the actual writes to the low level flash are reordered anyway thanks to the FTL. Changes made in the code 1. Make ssd mode data allocation identical to tetris mode, like nossd. 2. Adjust and clean up filesystem mount messages so that we can easily identify if a kernel has this patch applied or not, when providing support to end users. Also, make better use of the *_and_info helpers to only trigger messages on actual state changes. Backporting notes Notes for whoever wants to backport this patch to their 4.9 LTS kernel: * First apply commit 951e7966 "btrfs: drop the nossd flag when remounting with -o ssd", or fixup the differences manually. * The rest of the conflicts are because of the fs_info refactoring. So, for example, instead of using fs_info, it's root->fs_info in extent-tree.c Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 195634098f21..90b967ae46d0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3053,11 +3053,9 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(fs_info, SSD) && - !btrfs_test_opt(fs_info, NOSSD) && + if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { - btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); - btrfs_set_opt(fs_info->mount_opt, SSD); + btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } /* -- cgit From db95c876c568cef951fbbd4c0118cb5386e4bb99 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 18 Aug 2017 18:16:25 +0200 Subject: btrfs: submit superblock io with REQ_META and REQ_PRIO The superblock is also metadata of the filesystem so the relevant IO should be tagged as such. We also tag it as high priority, as it's the last block committed for metadata from a given transaction. Any delays would effectively block the whole transaction, also blocking any other operation holding the device_list_mutex. Reviewed-by: Josef Bacik Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 90b967ae46d0..27d458640536 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3433,9 +3433,10 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0) { ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA, bh); + REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_META | REQ_PRIO, bh); } if (ret) errors++; -- cgit