diff options
Diffstat (limited to 'fs/btrfs/raid56.c')
| -rw-r--r-- | fs/btrfs/raid56.c | 1614 |
1 files changed, 899 insertions, 715 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a2cf754912d..f38d8305e46d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -41,6 +40,85 @@ #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) +{ + if (unlikely(!bioc)) { + btrfs_crit(fs_info, "bioc=NULL"); + return; + } + btrfs_crit(fs_info, +"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", + bioc->logical, bioc->full_stripe_logical, bioc->size, + bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, + bioc->replace_stripe_src, bioc->num_stripes); + for (int i = 0; i < bioc->num_stripes; i++) { + btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", + i, bioc->stripes[i].dev->devid, + bioc->stripes[i].physical); + } +} + +static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, + const struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + dump_bioc(fs_info, rbio->bioc); + btrfs_crit(fs_info, +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx", + rbio->flags, rbio->nr_sectors, rbio->nr_data, + rbio->real_stripes, rbio->stripe_nsectors, + rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap); +} + +#define ASSERT_RBIO(expr, rbio) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? 
\ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "logical=%llu", (logical)); \ + } \ + ASSERT((expr)); \ +}) + /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; @@ -56,30 +134,25 @@ struct btrfs_stripe_hash_table { }; /* - * A bvec like structure to present a sector inside a page. - * - * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + * The PFN may still be valid, but our paddrs should always be block size + * aligned, thus such -1 paddr is definitely not a valid one. 
*/ -struct sector_ptr { - struct page *page; - unsigned int pgoff:24; - unsigned int uptodate:8; -}; +#define INVALID_PADDR (~(phys_addr_t)0) static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); - kfree(rbio->bio_sectors); - kfree(rbio->stripe_sectors); + kfree(rbio->bio_paddrs); + kfree(rbio->stripe_paddrs); kfree(rbio->finish_pointers); } @@ -122,8 +195,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -144,7 +216,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -155,6 +227,24 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) +{ + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + + ASSERT(sector_nr < rbio->nr_sectors); + for (int i = 0; i < rbio->sector_nsteps; i++) { + unsigned int index = sector_nr * rbio->sector_nsteps + i; + phys_addr_t dst = rbio->stripe_paddrs[index]; + phys_addr_t src = rbio->bio_paddrs[index]; + + ASSERT(dst != INVALID_PADDR); + 
ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), step); + } +} + /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. We @@ -175,24 +265,19 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) { + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk. */ if (i < rbio->nr_data * rbio->stripe_nsectors) - ASSERT(rbio->stripe_sectors[i].uptodate); + ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap)); continue; } - ASSERT(rbio->stripe_sectors[i].page); - memcpy_page(rbio->stripe_sectors[i].page, - rbio->stripe_sectors[i].pgoff, - rbio->bio_sectors[i].page, - rbio->bio_sectors[i].pgoff, - rbio->bioc->fs_info->sectorsize); - rbio->stripe_sectors[i].uptodate = 1; + memcpy_from_bio_to_stripe(rbio, i); + set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -202,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bioc->raid_map[0]; + u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. We're using byte @@ -215,19 +300,48 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } -static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, - unsigned int page_nr) +/* Get the sector number of the first sector covered by @page_nr. 
*/ +static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + u32 sector_nr; + + ASSERT(page_nr < rbio->nr_pages); + + sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; + ASSERT(sector_nr < rbio->nr_sectors); + return sector_nr; +} + +/* + * Get the number of sectors covered by @page_nr. + * + * For bs > ps cases, the result will always be 1. + * For bs <= ps cases, the result will be ps / bs. + */ +static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 nr_sectors; + + ASSERT(page_nr < rbio->nr_pages); + + nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits; + ASSERT(nr_sectors > 0); + return nr_sectors; +} + +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr); int i; ASSERT(page_nr < rbio->nr_pages); + ASSERT(sector_nr + nr_bits < rbio->nr_sectors); - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; - i++) { - if (!rbio->stripe_sectors[i].uptodate) + for (i = sector_nr; i < sector_nr + nr_bits; i++) { + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) return false; } return true; @@ -240,41 +354,44 @@ static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); u32 offset; int i; - for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps; + i++, offset += step) { int page_index = 
offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); - rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; - rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { - const u32 sectorsize = src->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; - int i; + const u32 sector_nr = page_nr_to_sector_nr(src, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(src, page_nr); + + ASSERT(page_nr < src->nr_pages); + ASSERT(sector_nr + nr_bits < src->nr_sectors); if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; - /* Also update the sector->uptodate bits. */ - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; i++) - dest->stripe_sectors[i].uptodate = true; + /* Also update the stripe_uptodate_bitmap bits. */ + bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits); } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { - const int sector_nr = (page_nr << PAGE_SHIFT) >> - rbio->bioc->fs_info->sectorsize_bits; + const int sector_nr = page_nr_to_sector_nr(rbio, page_nr); /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus @@ -332,12 +449,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. 
*/ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - bio_list_init(&victim->bio_list); } /* @@ -407,16 +523,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -425,19 +540,17 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; - unsigned long flags; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -467,14 +580,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ @@ -493,15 +605,15 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = 
list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -530,15 +642,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { - unsigned long flags; unsigned long size = rbio->bio_list_bytes; int ret = 1; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + spin_lock(&rbio->bio_list_lock); if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + spin_unlock(&rbio->bio_list_lock); return ret; } @@ -571,7 +682,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) + if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ @@ -588,46 +699,68 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - last->operation == BTRFS_RBIO_READ_REBUILD) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; } -static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +/* Return the sector index for @stripe_nr and @sector_nr. 
*/ +static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - ASSERT(stripe_nr < rbio->real_stripes); - ASSERT(sector_nr < rbio->stripe_nsectors); + unsigned int ret; + + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); + + ret = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(ret < rbio->nr_sectors); + return ret; +} + +/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */ +static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned int step_nr) +{ + unsigned int ret; - return stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr); + + ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr; + ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps); + return ret; } -/* Return a sector from rbio->stripe_sectors, not from the bio list */ -static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr, + unsigned int step_nr) { - return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, - sector_nr)]; + return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)]; } -/* Grab a sector inside P stripe */ -static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { - return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); + return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr); } -/* Grab a sector inside Q stripe, 
return NULL if not RAID6 */ -static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) - return NULL; - return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); + return INVALID_PADDR; + return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); +} + +/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */ +static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr) +{ + return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; } /* @@ -657,16 +790,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; - unsigned long flags; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) + if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); @@ -724,7 +856,7 @@ lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) @@ -742,7 +874,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; - unsigned long flags; int keep_cache = 0; bucket = rbio_bucket(rbio); @@ -751,7 +882,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (list_empty(&rbio->plug_list)) 
cache_rbio(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { @@ -788,12 +919,9 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, recover_rbio_work_locked); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); @@ -808,21 +936,21 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } done: spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); done_nolock: if (!keep_cache) remove_rbio_from_cache(rbio); } -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -832,7 +960,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; @@ -861,13 +989,13 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - 
rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* - * Get a sector pointer specified by its @stripe_nr and @sector_nr. + * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) @@ -877,32 +1005,52 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) * * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback. + * + * Return NULL if bio_list_only is set but the specified sector has no + * coresponding bio. */ -static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) +static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - struct sector_ptr *sector; - int index; - - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + phys_addr_t *ret = NULL; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0); - index = stripe_nr * rbio->stripe_nsectors + sector_nr; - ASSERT(index >= 0 && index < rbio->nr_sectors); + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); - spin_lock_irq(&rbio->bio_list_lock); - sector = &rbio->bio_sectors[index]; - if (sector->page || bio_list_only) { - /* Don't return sector without a valid page pointer */ - if (!sector->page) - sector = NULL; - spin_unlock_irq(&rbio->bio_list_lock); - return sector; + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = &rbio->bio_paddrs[index]; + return ret; + } } - spin_unlock_irq(&rbio->bio_list_lock); + return &rbio->stripe_paddrs[index]; +} + +/* + * Similar to 
sector_paddr_in_rbio(), but with extra consideration for + * bs > ps cases, where we can have multiple steps for a fs block. + */ +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) +{ + phys_addr_t ret = INVALID_PADDR; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); - return &rbio->stripe_sectors[index]; + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; + return ret; + } + } + return rbio->stripe_paddrs[index]; } /* @@ -912,40 +1060,56 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc) { - const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; + const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE); + const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; - /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ - ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * For bs <= ps cases, ps must be aligned to bs. + * For bs > ps cases, bs must be aligned to ps. 
+ */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) || + IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). + */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); - rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); - rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); - if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers || !rbio->error_bitmap) { + if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs || + !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } + for (int i = 0; i < num_sectors * sector_nsteps; i++) { + rbio->stripe_paddrs[i] = INVALID_PADDR; + rbio->bio_paddrs[i] = INVALID_PADDR; + } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); @@ -960,11 +1124,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; + rbio->sector_nsteps = sector_nsteps; refcount_set(&rbio->refs, 1); 
atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -974,7 +1140,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); if (ret < 0) return ret; /* Mapping all sectors */ @@ -989,7 +1155,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages); + rbio->stripe_pages + data_pages, false); if (ret < 0) return ret; @@ -998,13 +1164,13 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. */ -static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, - int *faila, int *failb) +static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) { int stripe_nr; int found_errors = 0; @@ -1036,20 +1202,41 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, return found_errors; } +static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps, + unsigned int step) +{ + int added = 0; + int ret; + + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + if (ret != step) + goto revert; + added += ret; + } + return added; +revert: + /* + * We don't need to revert the bvec, as the bio will be submitted immediately, + * as long as the size is reduced the extra bvec will not be accessed. 
+ */ + bio->bi_iter.bi_size -= added; + return 0; +} + /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. - * Return <0 for error. + * Return <0 for error, and no byte will be added to @rbio. */ -static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct sector_ptr *sector, - unsigned int stripe_nr, - unsigned int sector_nr, - enum req_op op) +static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t *paddrs, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); struct bio *last = bio_list->tail; int ret; struct bio *bio; @@ -1061,9 +1248,11 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes. */ - ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); - ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); - ASSERT(sector->page); + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); + ASSERT(paddrs != NULL); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1076,16 +1265,16 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio->error_bitmap); /* Check if we have reached tolerance early. 
*/ - found_errors = get_rbio_veritical_errors(rbio, sector_nr, - NULL, NULL); - if (found_errors > rbio->bioc->max_errors) + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + NULL, NULL); + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; } /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* @@ -1094,8 +1283,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, sector->page, sectorsize, - sector->pgoff); + ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step); if (ret == sectorsize) return 0; } @@ -1105,34 +1293,30 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); - bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step); + ASSERT(ret == sectorsize); bio_list_add(bio_list, bio); return 0; } static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec bvec; - struct bvec_iter iter; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT); + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; - bio_for_each_segment(bvec, bio, iter) { - u32 bvec_offset; + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + unsigned int index = 
(offset >> step_bits); - for (bvec_offset = 0; bvec_offset < bvec.bv_len; - bvec_offset += sectorsize, offset += sectorsize) { - int index = offset / sectorsize; - struct sector_ptr *sector = &rbio->bio_sectors[index]; - - sector->page = bvec.bv_page; - sector->pgoff = bvec.bv_offset + bvec_offset; - ASSERT(sector->pgoff < PAGE_SIZE); - } + rbio->bio_paddrs[index] = paddr; + offset += step; } } @@ -1148,11 +1332,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) index_one_bio(rbio, bio); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, @@ -1183,52 +1367,94 @@ not_found: trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT_RBIO(rbio->real_stripes >= 2, rbio); + ASSERT_RBIO(rbio->nr_data > 0, rbio); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); +} + +static inline void *kmap_local_paddr(phys_addr_t paddr) +{ + /* The sector pointer must have a page mapped to it. 
*/ + ASSERT(paddr != INVALID_PADDR); + + return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); +} + +static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr, + unsigned int step_nr) { void **pointers = rbio->finish_pointers; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct sector_ptr *sector; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } + for (stripe = 0; stripe < rbio->nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr)); if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe++] = kmap_local_paddr( + rbio_qstripe_paddr(rbio, sector_nr, step_nr)); - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); + assert_rbio(rbio); + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); } else { /* raid5 */ - memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + memcpy(pointers[rbio->nr_data], pointers[0], step); + run_xor(pointers + 1, rbio->nr_data - 1, step); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } +/* Generate PQ 
for one vertical stripe. */ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6); + + for (int i = 0; i < rbio->sector_nsteps; i++) + generate_pq_vertical_step(rbio, sectornr, i); + + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); + if (has_qstripe) + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); +} + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1252,7 +1478,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1262,31 +1488,42 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe, sectornr, REQ_OP_WRITE); if (ret) goto error; } - if (likely(!rbio->bioc->num_tgtdevs)) + if (likely(!rbio->bioc->replace_nr_stripes)) return 0; - /* Make a copy for the replace target device. */ + /* + * Make a copy for the replace target device. + * + * Thus the source stripe number (in replace_stripe_src) should be valid. 
+ */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!rbio->bioc->tgtdev_map[stripe]) { + /* + * For RAID56, there is only one device that can be replaced, + * and replace_stripe_src indicates the stripe number we + * need to copy from. + */ + if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1301,15 +1538,15 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, - rbio->bioc->tgtdev_map[stripe], + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, + rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto error; @@ -1317,8 +1554,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1326,7 +1562,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; int total_nr_sector = offset >> fs_info->sectorsize_bits; ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); @@ -1357,22 +1593,17 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no 
longer set page Uptodate directly for - * stripe_pages[], thus we need to locate the sector. + * Return the index inside the rbio->stripe_sectors[] array. + * + * Return -1 if not found. */ -static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - struct page *page, - unsigned int pgoff) +static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { - int i; - - for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; - - if (sector->page == page && sector->pgoff == pgoff) - return sector; + for (int i = 0; i < rbio->nr_sectors; i++) { + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr) + return i; } - return NULL; + return -1; } /* @@ -1382,38 +1613,34 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + const u32 step = min(sectorsize, PAGE_SIZE); + u32 offset = 0; + phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct sector_ptr *sector; - int pgoff; + btrfs_bio_for_each_block_all(paddr, bio, step) { + /* Hitting the first step of a sector. 
*/ + if (IS_ALIGNED(offset, sectorsize)) { + int sector_nr = find_stripe_sector_nr(rbio, paddr); - for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; - pgoff += sectorsize) { - sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); - ASSERT(sector); - if (sector) - sector->uptodate = 1; + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); } + offset += step; } } static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct bio_vec *bv = bio_first_bvec_all(bio); + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); int i; for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector; - - sector = &rbio->stripe_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; - sector = &rbio->bio_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1425,13 +1652,20 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; - bitmap_set(rbio->error_bitmap, total_sector_nr, - bio_size >> rbio->bioc->fs_info->sectorsize_bits); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. 
*/ @@ -1439,9 +1673,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = rbio->sector_nsteps; int total_sector_nr = get_bio_sector_nr(rbio, bio); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + u32 offset = 0; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1451,26 +1688,26 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - bio_for_each_segment_all(bvec, bio, iter_all) { - int bv_offset; + btrfs_bio_for_each_block_all(paddr, bio, step) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum; - for (bv_offset = bvec->bv_offset; - bv_offset < bvec->bv_offset + bvec->bv_len; - bv_offset += fs_info->sectorsize, total_sector_nr++) { - u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + - total_sector_nr * fs_info->csum_size; - int ret; + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; - /* No csum for this sector, skip to the next sector. */ - if (!test_bit(total_sector_nr, rbio->csum_bitmap)) - continue; + /* Not yet covering the full fs block, continue to the next step. */ + if (!IS_ALIGNED(offset, fs_info->sectorsize)) + continue; - ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, - bv_offset, csum_buf, expected_csum); - if (ret < 0) - set_bit(total_sector_nr, rbio->error_bitmap); - } + /* No csum for this sector, skip to the next sector. 
*/ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; + + expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0)) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; } } @@ -1490,7 +1727,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1499,49 +1736,16 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_scrub_read_recover_enabled()) { + if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. 
- */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1549,7 +1753,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); if (ret < 0) return ret; @@ -1568,7 +1772,6 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct work_struct work; }; /* @@ -1600,8 +1803,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { @@ -1629,14 +1832,15 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; - const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; - ASSERT(orig_logical >= 
full_stripe_start && - orig_logical + orig_len <= full_stripe_start + - rbio->nr_data * BTRFS_STRIPE_LEN); + ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN, + rbio, orig_logical); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; @@ -1660,12 +1864,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1674,41 +1878,33 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. 
*/ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; + phys_addr_t *paddrs; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; @@ -1720,59 +1916,33 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, - csum_buf, csum_expected); - return ret; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) + return -EIO; + return 0; } -/* - * Recover a vertical stripe specified by @sector_nr. - * @*pointers are the pre-allocated pointers by the caller, so we don't - * need to allocate/free the pointers again and again. 
- */ -static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static void recover_vertical_step(struct btrfs_raid_bio *rbio, + unsigned int sector_nr, + unsigned int step_nr, + int faila, int failb, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; - const u32 sectorsize = fs_info->sectorsize; - int found_errors; - int faila; - int failb; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); int stripe_nr; - int ret = 0; - - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sector_nr, &rbio->dbitmap)) - return 0; - - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* - * No errors in the veritical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. - */ - if (!found_errors) - return 0; - if (found_errors > rbio->bioc->max_errors) - return -EIO; + ASSERT(step_nr < rbio->sector_nsteps); + ASSERT(sector_nr < rbio->stripe_nsectors); /* * Setup our array of pointers with sectors from each stripe @@ -1781,19 +1951,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * pointer order. */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + phys_addr_t paddr; + /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. 
*/ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); } - ASSERT(sector->page); - pointers[stripe_nr] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -1823,9 +1992,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * here due to a crc mismatch and we can't give them the * data they want. */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) + if (failb == rbio->real_stripes - 1) { + if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. 
* We only care about data stripes recovery, @@ -1839,11 +2007,11 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, goto pstripe; } - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, sectorsize, + if (failb == rbio->real_stripes - 2) { + raid6_datap_recov(rbio->real_stripes, step, faila, pointers); } else { - raid6_2data_recov(rbio->real_stripes, sectorsize, + raid6_2data_recov(rbio->real_stripes, step, faila, failb, pointers); } } else { @@ -1853,7 +2021,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + memcpy(pointers[faila], pointers[rbio->nr_data], step); /* Rearrange the pointer array */ p = pointers[faila]; @@ -1863,40 +2031,66 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - + run_xor(pointers, rbio->nr_data - 1, step); } +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + int found_errors; + int faila; + int failb; + int ret = 0; + /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired in the vertical stripe, thus they are now - * uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. - * - * If possible, also check if the repaired sector matches its data - * checksum. 
+ * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + + for (int i = 0; i < rbio->sector_nsteps; i++) + recover_vertical_step(rbio, sector_nr, i, faila, failb, + pointers, unmap_array); if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, faila, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, faila, sector_nr), + rbio->stripe_uptodate_bitmap); } if (failb >= 0) { - ret = verify_one_sector(rbio, faila, sector_nr); + ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, failb, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, failb, sector_nr), + rbio->stripe_uptodate_bitmap); } - -cleanup: - for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) - kunmap_local(unmap_array[stripe_nr]); return ret; } @@ -1920,11 +2114,10 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } index_rbio_pages(rbio); @@ -1941,14 +2134,25 @@ out: return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - 
struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -1961,7 +2165,7 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* * Skip the range which has error. It can be a range which is @@ -1978,79 +2182,33 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, continue; } - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. 
*/ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2070,7 +2228,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n int faila; int failb; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. 
*/ if (!found_errors) @@ -2137,8 +2295,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, - rbio->bioc->raid_map[0]); - const u64 start = rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical); + const u64 start = rbio->bioc->full_stripe_logical; const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << fs_info->sectorsize_bits; int ret; @@ -2170,7 +2328,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) goto error; } - ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; @@ -2186,7 +2344,7 @@ error: */ btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", - rbio->bioc->raid_map[0], ret); + rbio->bioc->full_stripe_logical, ret); no_csum: kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); @@ -2196,11 +2354,9 @@ no_csum: static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2209,32 +2365,39 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. 
+ */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t *paddrs; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - if (err) + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2250,11 +2413,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; - if (trace_raid56_write_stripe_enabled()) { + if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } @@ -2269,20 +2432,21 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; + phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached 
one, as cached one must * have all its data sectors present and uptodate. */ - if (!sector->page || !sector->uptodate) + if (paddr == INVALID_PADDR || + !test_bit(i, rbio->stripe_uptodate_bitmap)) return true; } return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2294,38 +2458,36 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw. */ - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); @@ -2348,7 +2510,7 @@ write: bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. 
*/ ASSERT(bio_list_size(&bio_list)); @@ -2359,38 +2521,28 @@ write: for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL); + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2434,27 +2586,30 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, break; } } - ASSERT(i < rbio->real_stripes); + ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; } -/* Used for both parity scrub and missing. 
*/ -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - unsigned int pgoff, u64 logical) +static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio, + int sector_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe_offset; - int index; + const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize); + const u32 base = sector_nr * rbio->sector_nsteps; + + for (int i = base; i < base + rbio->sector_nsteps; i++) { + const unsigned int page_index = (i * step) >> PAGE_SHIFT; + struct page *page; - ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - BTRFS_STRIPE_LEN * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset / sectorsize; - rbio->bio_sectors[index].page = page; - rbio->bio_sectors[index].pgoff = pgoff; + if (rbio->stripe_pages[page_index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[page_index] = page; + } + return 0; } /* @@ -2463,42 +2618,96 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int total_sector_nr; for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct page *page; int sectornr = total_sector_nr % rbio->stripe_nsectors; - int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; + int ret; if (!test_bit(sectornr, &rbio->dbitmap)) continue; - if (rbio->stripe_pages[index]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; + ret = alloc_rbio_sector_pages(rbio, total_sector_nr); + if (ret < 0) + return ret; } index_stripe_sectors(rbio); return 0; } -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) +/* Return true if the content of the step matches the caclulated 
one. */ +static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr, + unsigned int step_nr) +{ + const unsigned int nr_data = rbio->nr_data; + const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2); + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + void *parity; + bool ret = false; + + ASSERT(step_nr < rbio->sector_nsteps); + + /* First collect one page from each data stripe. */ + for (int stripe = 0; stripe < nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, + step_nr, 0)); + + if (has_qstripe) { + assert_rbio(rbio); + /* RAID6, call the library function to fill in our P/Q. */ + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); + } else { + /* RAID5. */ + memcpy(pointers[nr_data], pointers[0], step); + run_xor(pointers + 1, nr_data - 1, step); + } + + /* Check scrubbing parity and repair it. */ + parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); + if (memcmp(parity, pointers[rbio->scrubp], step) != 0) + memcpy(parity, pointers[rbio->scrubp], step); + else + ret = true; + kunmap_local(parity); + + for (int stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); + return ret; +} + +/* + * The @pointers array should have the P/Q parity already mapped. 
+ */ +static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr) +{ + bool found_error = false; + + for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { + bool match; + + match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); + if (!match) + found_error = true; + } + if (!found_error) + bitmap_clear(&rbio->dbitmap, sector_nr, 1); +} + +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; - int stripe; int sectornr; bool has_qstripe; - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; + struct page *page; + phys_addr_t p_paddr = INVALID_PADDR; + phys_addr_t q_paddr = INVALID_PADDR; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2511,7 +2720,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) else BUG(); - if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { + /* + * Replace is running and our P/Q stripe is being replaced, then we + * need to duplicate the final write to replace target. 
+ */ + if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } @@ -2523,88 +2736,51 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - if (!need_check) - goto writeback; - - p_sector.page = alloc_page(GFP_NOFS); - if (!p_sector.page) + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - p_sector.pgoff = 0; - p_sector.uptodate = 1; + p_paddr = page_to_phys(page); + page = NULL; + pointers[nr_data] = kmap_local_paddr(p_paddr); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_sector.page = alloc_page(GFP_NOFS); - if (!q_sector.page) { - __free_page(p_sector.page); - p_sector.page = NULL; + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; return -ENOMEM; } - q_sector.pgoff = 0; - q_sector.uptodate = 1; - pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + q_paddr = page_to_phys(page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_sector.page); - - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - void *parity; - - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - if (has_qstripe) { - /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - - /* Check scrubbing parity and repair it */ - 
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_page(sector->page) + sector->pgoff; - if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) - memcpy(parity, pointers[rbio->scrubp], sectorsize); - else - /* Parity is right, needn't writeback */ - bitmap_clear(&rbio->dbitmap, sectornr, 1); - kunmap_local(parity); - - for (stripe = nr_data - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) + verify_one_parity_sector(rbio, pointers, sectornr); kunmap_local(pointers[nr_data]); - __free_page(p_sector.page); - p_sector.page = NULL; - if (q_sector.page) { - kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_sector.page); - q_sector.page = NULL; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + if (q_paddr != INVALID_PADDR) { + __free_page(phys_to_page(q_paddr)); + q_paddr = INVALID_PADDR; } -writeback: /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; @@ -2613,13 +2789,17 @@ writeback: if (!is_replace) goto submit_write; + /* + * Replace is running and our parity stripe needs to be duplicated to + * the target device. Check we have a valid source stripe number. 
+ */ + ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - bioc->tgtdev_map[rbio->scrubp], - sectornr, REQ_OP_WRITE); + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2629,8 +2809,7 @@ submit_write: return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2667,9 +2846,9 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) int failb; int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; goto out; } @@ -2693,7 +2872,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) { + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { ret = -EIO; goto out; } @@ -2710,7 +2889,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. 
*/ - if (failp != rbio->scrubp) { + if (unlikely(failp != rbio->scrubp)) { ret = -EIO; goto out; } @@ -2725,21 +2904,18 @@ out: return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) @@ -2747,93 +2923,76 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, /* * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a sector + * read them from the disk. If sector_paddr_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. 
*/ - if (sector->uptodate) + if (test_bit(rbio_sector_index(rbio, stripe, sectornr), + rbio->stripe_uptodate_bitmap)) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { - bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. 
*/ - ret = finish_parity_scrub(rbio, need_check); + ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) @@ -2842,32 +3001,57 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) start_async_work(rbio, scrub_rbio_work_locked); } -/* The following code is used for dev replace of a missing RAID 5/6 device. */ - -struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) +/* + * This is for scrub call sites where we already have correct data contents. + * This allows us to avoid reading data stripes again. + * + * Unfortunately here we have to do folio copy, other than reusing the pages. + * This is due to the fact rbio has its own page management for its cache. 
+ */ +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical) { - struct btrfs_fs_info *fs_info = bioc->fs_info; - struct btrfs_raid_bio *rbio; - - rbio = alloc_rbio(fs_info, bioc); - if (IS_ERR(rbio)) - return NULL; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 offset_in_full_stripe = data_logical - + rbio->bioc->full_stripe_logical; + unsigned int findex = 0; + unsigned int foffset = 0; + int ret; - rbio->operation = BTRFS_RBIO_REBUILD_MISSING; - bio_list_add(&rbio->bio_list, bio); /* - * This is a special bio which is used to hold the completion handler - * and make the scrub rbio is similar to the other types + * If we hit ENOMEM temporarily, but later at + * raid56_parity_submit_scrub_rbio() time it succeeded, we just do + * the extra read, not a big deal. + * + * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, + * the bio would got proper error number set. */ - ASSERT(!bio->bi_iter.bi_size); - - set_rbio_range_error(rbio, bio); - - return rbio; -} + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return; -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) -{ - start_async_work(rbio, recover_rbio_work); + /* data_logical must be at stripe boundary and inside the full stripe. 
*/ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); + + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } + } + bitmap_set(rbio->stripe_uptodate_bitmap, + offset_in_full_stripe >> fs_info->sectorsize_bits, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); } |
