author    | Arnaldo Carvalho de Melo <acme@redhat.com> | 2021-06-30 15:27:32 -0300
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2021-06-30 15:27:32 -0300
commit    | 857286e4c5ae5d2e860fd15d4628e707b434d7e5 (patch)
tree      | 520ea5916f50fb2a4289d8d70438d559c6808b01 /fs/btrfs/scrub.c
parent    | 51f382428c17f172f430f9be8de4246b8f15f97c (diff)
parent    | 007b350a58754a93ca9fe50c498cc27780171153 (diff)
Merge remote-tracking branch 'torvalds/master' into perf/core
To pick up fixes.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 159
1 file changed, 111 insertions, 48 deletions
```diff
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 485cda3eb8d7..088641ba7a8e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -165,6 +165,10 @@ struct scrub_ctx {
 	int readonly;
 	int pages_per_rd_bio;
 
+	/* State of IO submission throttling affecting the associated device */
+	ktime_t throttle_deadline;
+	u64 throttle_sent;
+
 	int is_dev_replace;
 	u64 write_pointer;
 
@@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
 	init_waitqueue_head(&sctx->list_wait);
+	sctx->throttle_deadline = 0;
 
 	WARN_ON(sctx->wr_curr_bio != NULL);
 	mutex_init(&sctx->wr_lock);
@@ -626,7 +631,6 @@ nomem:
 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 				     void *warn_ctx)
 {
-	u64 isize;
 	u32 nlink;
 	int ret;
 	int i;
@@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	eb = swarn->path->nodes[0];
 	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 				    struct btrfs_inode_item);
-	isize = btrfs_inode_size(eb, inode_item);
 	nlink = btrfs_inode_nlink(eb, inode_item);
 	btrfs_release_path(swarn->path);
 
@@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	 */
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 		btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
 				  swarn->errstr, swarn->logical,
 				  rcu_str_deref(swarn->dev->name),
 				  swarn->physical,
 				  root, inum, offset,
-				  min(isize - offset, (u64)PAGE_SIZE), nlink,
+				  fs_info->sectorsize, nlink,
 				  (char *)(unsigned long)ipath->fspath->val[i]);
 
 	btrfs_put_root(local_root);
@@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	 * read all mirrors one after the other. This includes to
 	 * re-read the extent or metadata block that failed (that was
 	 * the cause that this fixup code is called) another time,
-	 * page by page this time in order to know which pages
+	 * sector by sector this time in order to know which sectors
 	 * caused I/O errors and which ones are good (for all mirrors).
 	 * It is the goal to handle the situation when more than one
 	 * mirror contains I/O errors, but the errors do not
 	 * overlap, i.e. the data can be repaired by selecting the
-	 * pages from those mirrors without I/O error on the
-	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
-	 * would be that mirror #1 has an I/O error on the first page,
-	 * the second page is good, and mirror #2 has an I/O error on
-	 * the second page, but the first page is good.
-	 * Then the first page of the first mirror can be repaired by
-	 * taking the first page of the second mirror, and the
-	 * second page of the second mirror can be repaired by
-	 * copying the contents of the 2nd page of the 1st mirror.
-	 * One more note: if the pages of one mirror contain I/O
+	 * sectors from those mirrors without I/O error on the
+	 * particular sectors. One example (with blocks >= 2 * sectorsize)
+	 * would be that mirror #1 has an I/O error on the first sector,
+	 * the second sector is good, and mirror #2 has an I/O error on
+	 * the second sector, but the first sector is good.
+	 * Then the first sector of the first mirror can be repaired by
+	 * taking the first sector of the second mirror, and the
+	 * second sector of the second mirror can be repaired by
+	 * copying the contents of the 2nd sector of the 1st mirror.
+	 * One more note: if the sectors of one mirror contain I/O
 	 * errors, the checksum cannot be verified. In order to get
 	 * the best data for repairing, the first attempt is to find
 	 * a mirror without I/O errors and with a validated checksum.
-	 * Only if this is not possible, the pages are picked from
+	 * Only if this is not possible, the sectors are picked from
 	 * mirrors with I/O errors without considering the checksum.
 	 * If the latter is the case, at the end, the checksum of the
 	 * repaired area is verified in order to correctly maintain
@@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/*
 	 * In case of I/O errors in the area that is supposed to be
-	 * repaired, continue by picking good copies of those pages.
-	 * Select the good pages from mirrors to rewrite bad pages from
+	 * repaired, continue by picking good copies of those sectors.
+	 * Select the good sectors from mirrors to rewrite bad sectors from
 	 * the area to fix. Afterwards verify the checksum of the block
 	 * that is supposed to be repaired. This verification step is
 	 * only done for the purpose of statistic counting and for the
 	 * final scrub report, whether errors remain.
 	 * A perfect algorithm could make use of the checksum and try
-	 * all possible combinations of pages from the different mirrors
+	 * all possible combinations of sectors from the different mirrors
 	 * until the checksum verification succeeds. For example, when
-	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+	 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
 	 * of mirror #2 is readable but the final checksum test fails,
-	 * then the 2nd page of mirror #3 could be tried, whether now
+	 * then the 2nd sector of mirror #3 could be tried, whether now
 	 * the final checksum succeeds. But this would be a rare
 	 * exception and is therefore not implemented. At least it is
 	 * avoided that the good copy is overwritten.
 	 * A more useful improvement would be to pick the sectors
 	 * without I/O error based on sector sizes (512 bytes on legacy
-	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+	 * disks) instead of on sectorsize. Then maybe 512 byte of one
 	 * mirror could be repaired by taking 512 byte of a different
-	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+	 * mirror, even if other 512 byte sectors in the same sectorsize
 	 * area are unreadable.
 	 */
 	success = 1;
@@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 {
 	struct scrub_ctx *sctx = original_sblock->sctx;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	u64 length = original_sblock->page_count * PAGE_SIZE;
+	u64 length = original_sblock->page_count * fs_info->sectorsize;
 	u64 logical = original_sblock->pagev[0]->logical;
 	u64 generation = original_sblock->pagev[0]->generation;
 	u64 flags = original_sblock->pagev[0]->flags;
@@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 	 */
 
 	while (length > 0) {
-		sublen = min_t(u64, length, PAGE_SIZE);
+		sublen = min_t(u64, length, fs_info->sectorsize);
 		mapped_length = sublen;
 		bbio = NULL;
 
 		/*
-		 * with a length of PAGE_SIZE, each returned stripe
-		 * represents one mirror
+		 * With a length of sectorsize, each returned stripe represents
+		 * one mirror
 		 */
 		btrfs_bio_counter_inc_blocked(fs_info);
 		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
@@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		bio = btrfs_io_bio_alloc(1);
 		bio_set_dev(bio, spage->dev->bdev);
 
-		bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+		bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
 		bio->bi_iter.bi_sector = spage->physical >> 9;
 		bio->bi_opf = REQ_OP_READ;
 
@@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
 	struct scrub_page *spage_good = sblock_good->pagev[page_num];
 	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+	const u32 sectorsize = fs_info->sectorsize;
 
 	BUG_ON(spage_bad->page == NULL);
 	BUG_ON(spage_good->page == NULL);
@@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		bio->bi_iter.bi_sector = spage_bad->physical >> 9;
 		bio->bi_opf = REQ_OP_WRITE;
 
-		ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
-		if (PAGE_SIZE != ret) {
+		ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
+		if (ret != sectorsize) {
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 {
 	struct scrub_bio *sbio;
 	int ret;
+	const u32 sectorsize = sctx->fs_info->sectorsize;
 
 	mutex_lock(&sctx->wr_lock);
 again:
@@ -1681,16 +1686,16 @@ again:
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		bio->bi_opf = REQ_OP_WRITE;
 		sbio->status = 0;
-	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+	} else if (sbio->physical + sbio->page_count * sectorsize !=
 		   spage->physical_for_dev_replace ||
-		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   sbio->logical + sbio->page_count * sectorsize !=
 		   spage->logical) {
 		scrub_wr_submit(sctx);
 		goto again;
 	}
 
-	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
-	if (ret != PAGE_SIZE) {
+	ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+	if (ret != sectorsize) {
 		if (sbio->page_count < 1) {
 			bio_put(sbio->bio);
 			sbio->bio = NULL;
@@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
 	btrfsic_submit_bio(sbio->bio);
 
 	if (btrfs_is_zoned(sctx->fs_info))
-		sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
+		sctx->write_pointer = sbio->physical + sbio->page_count *
+			sctx->fs_info->sectorsize;
 }
 
 static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage)
 	}
 }
 
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+	const int time_slice = 1000;
+	struct scrub_bio *sbio;
+	struct btrfs_device *device;
+	s64 delta;
+	ktime_t now;
+	u32 div;
+	u64 bwlimit;
+
+	sbio = sctx->bios[sctx->curr];
+	device = sbio->dev;
+	bwlimit = READ_ONCE(device->scrub_speed_max);
+	if (bwlimit == 0)
+		return;
+
+	/*
+	 * Slice is divided into intervals when the IO is submitted, adjust by
+	 * bwlimit and maximum of 64 intervals.
+	 */
+	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+	div = min_t(u32, 64, div);
+
+	/* Start new epoch, set deadline */
+	now = ktime_get();
+	if (sctx->throttle_deadline == 0) {
+		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+		sctx->throttle_sent = 0;
+	}
+
+	/* Still in the time to send? */
+	if (ktime_before(now, sctx->throttle_deadline)) {
+		/* If current bio is within the limit, send it */
+		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+		if (sctx->throttle_sent <= div_u64(bwlimit, div))
+			return;
+
+		/* We're over the limit, sleep until the rest of the slice */
+		delta = ktime_ms_delta(sctx->throttle_deadline, now);
+	} else {
+		/* New request after deadline, start new epoch */
+		delta = 0;
+	}
+
+	if (delta) {
+		long timeout;
+
+		timeout = div_u64(delta * HZ, 1000);
+		schedule_timeout_interruptible(timeout);
+	}
+
+	/* Next call will start the deadline period */
+	sctx->throttle_deadline = 0;
+}
+
 static void scrub_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_bio *sbio;
@@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	if (sctx->curr == -1)
 		return;
 
+	scrub_throttle(sctx);
+
 	sbio = sctx->bios[sctx->curr];
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
@@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 {
 	struct scrub_block *sblock = spage->sblock;
 	struct scrub_bio *sbio;
+	const u32 sectorsize = sctx->fs_info->sectorsize;
 	int ret;
 
 again:
@@ -2044,9 +2112,9 @@ again:
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		bio->bi_opf = REQ_OP_READ;
 		sbio->status = 0;
-	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+	} else if (sbio->physical + sbio->page_count * sectorsize !=
 		   spage->physical ||
-		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   sbio->logical + sbio->page_count * sectorsize !=
 		   spage->logical ||
 		   sbio->dev != spage->dev) {
 		scrub_submit(sctx);
@@ -2054,8 +2122,8 @@ again:
 	}
 
 	sbio->pagev[sbio->page_count] = spage;
-	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
-	if (ret != PAGE_SIZE) {
+	ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+	if (ret != sectorsize) {
 		if (sbio->page_count < 1) {
 			bio_put(sbio->bio);
 			sbio->bio = NULL;
@@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
 	if (sblock->sparity && corrupted && !sblock->data_corrected) {
 		u64 start = sblock->pagev[0]->logical;
 		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
-			  PAGE_SIZE;
+			  sblock->sctx->fs_info->sectorsize;
 
 		ASSERT(end - start <= U32_MAX);
 		scrub_parity_mark_sectors_error(sblock->sparity,
@@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su
  * the csum into @csum.
  *
  * The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
  * that is before @logical.
  *
  * Return 0 if there is no csum for the range.
@@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	physical = map->stripes[num].physical;
 	offset = 0;
 	nstripes = div64_u64(length, map->stripe_len);
+	mirror_num = 1;
+	increment = map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
-		mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
 		mirror_num = num % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
-		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 		get_raid56_logic_offset(physical, num, map, &offset, NULL);
 		increment = map->stripe_len * nr_data_stripes(map);
-		mirror_num = 1;
-	} else {
-		increment = map->stripe_len;
-		mirror_num = 1;
 	}
 
 	path = btrfs_alloc_path();
```
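The comment block reworded in scrub_handle_errored_block() above describes the repair strategy: as long as read errors on the mirrors do not overlap, every sector of the block can be taken from some mirror that read it successfully. A minimal standalone sketch of that per-sector selection, using hypothetical types (struct mirror_block, repair_from_mirrors) rather than the btrfs scrub structures:

```c
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical per-mirror view of one block: raw data plus per-sector status. */
struct mirror_block {
	const unsigned char *data;	/* nr_sectors * sectorsize bytes */
	const bool *io_error;		/* io_error[i] set if sector i failed to read */
};

/*
 * Copy each sector from the first mirror that read it without an I/O error.
 * Returns false if some sector is unreadable on every mirror, i.e. the
 * errors overlap and the block cannot be repaired this way.
 */
static bool repair_from_mirrors(unsigned char *out, size_t sectorsize,
				size_t nr_sectors,
				const struct mirror_block *mirrors,
				size_t nr_mirrors)
{
	for (size_t i = 0; i < nr_sectors; i++) {
		size_t m;

		for (m = 0; m < nr_mirrors; m++) {
			if (!mirrors[m].io_error[i]) {
				memcpy(out + i * sectorsize,
				       mirrors[m].data + i * sectorsize,
				       sectorsize);
				break;
			}
		}
		if (m == nr_mirrors)
			return false;	/* all mirrors failed on this sector */
	}
	return true;
}
```

As the comment notes, the real code first looks for a whole mirror with no I/O errors and a valid checksum, and only falls back to this sector-by-sector selection when none exists.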
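The new scrub_throttle() helper added above rate-limits IO submission against the per-device scrub_speed_max value. The sketch below is a user-space model of the same arithmetic (a hypothetical harness, not the kernel function): the 1-second slice is split into at most 64 intervals derived from the bandwidth limit, submitted bytes are counted against bwlimit / div, and once that budget is exceeded the caller is told to sleep for the remainder of the interval.

```c
#include <stdint.h>
#include <stdio.h>

#define TIME_SLICE_MS 1000

struct throttle_state {
	int64_t deadline_ms;	/* 0 means "start a new epoch on the next call" */
	uint64_t sent;		/* bytes submitted in the current interval */
};

/* Return how many milliseconds to sleep before submitting bio_bytes. */
static int64_t throttle_delay_ms(struct throttle_state *st, uint64_t bwlimit,
				 uint64_t bio_bytes, int64_t now_ms)
{
	uint32_t div;
	int64_t delta = 0;

	if (bwlimit == 0)
		return 0;			/* throttling disabled */

	/* One interval per 16 MiB/s of allowed bandwidth, clamped to [1, 64]. */
	div = (uint32_t)(bwlimit / (16 * 1024 * 1024));
	if (div < 1)
		div = 1;
	if (div > 64)
		div = 64;

	/* Start a new epoch: set the deadline for this interval. */
	if (st->deadline_ms == 0) {
		st->deadline_ms = now_ms + TIME_SLICE_MS / div;
		st->sent = 0;
	}

	if (now_ms < st->deadline_ms) {
		st->sent += bio_bytes;
		if (st->sent <= bwlimit / div)
			return 0;		/* within the interval's budget */
		/* Over the limit: sleep out the rest of the interval. */
		delta = st->deadline_ms - now_ms;
	}

	st->deadline_ms = 0;	/* the next call starts a fresh deadline period */
	return delta;
}

int main(void)
{
	struct throttle_state st = { 0, 0 };

	/* 32 MiB/s limit -> div = 2, i.e. a 16 MiB budget per 500 ms interval. */
	printf("%lld\n", (long long)throttle_delay_ms(&st, 32 << 20, 8 << 20, 0));
	printf("%lld\n", (long long)throttle_delay_ms(&st, 32 << 20, 12 << 20, 100));
	return 0;
}
```

The second call pushes the running total to 20 MiB, over the 16 MiB budget, so it reports a 400 ms delay; the kernel version reaches the same decision and then calls schedule_timeout_interruptible() for that remainder.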