Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--  fs/btrfs/raid56.c | 3476
1 file changed, 1892 insertions(+), 1584 deletions(-)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c870ef70f817..f38d8305e46d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,11 +13,14 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -37,6 +40,85 @@ #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) +{ + if (unlikely(!bioc)) { + btrfs_crit(fs_info, "bioc=NULL"); + return; + } + btrfs_crit(fs_info, +"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", + bioc->logical, bioc->full_stripe_logical, bioc->size, + bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, + bioc->replace_stripe_src, bioc->num_stripes); + for (int i = 0; i < bioc->num_stripes; i++) { + btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", + i, bioc->stripes[i].dev->devid, + bioc->stripes[i].physical); + } +} + +static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, + const struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + dump_bioc(fs_info, rbio->bioc); + btrfs_crit(fs_info, +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx", + rbio->flags, rbio->nr_sectors, rbio->nr_data, + rbio->real_stripes, rbio->stripe_nsectors, + rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap); +} + +#define ASSERT_RBIO(expr, rbio) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ + } \ + ASSERT((expr)); \ +}) + +#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ +({ \ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ + const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? 
\ + (rbio)->bioc->fs_info : NULL; \ + \ + btrfs_dump_rbio(__fs_info, (rbio)); \ + btrfs_crit(__fs_info, "logical=%llu", (logical)); \ + } \ + ASSERT((expr)); \ +}) + /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; @@ -51,147 +133,56 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_fs_info *fs_info; - struct btrfs_bio *bbio; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct btrfs_work work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* size of each individual stripe on disk */ - int stripe_len; - - /* number of data stripes (no p/q) */ - int nr_data; - - int real_stripes; - - int stripe_npages; - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* first bad stripe */ - int faila; - - /* second bad stripe (for raid6 use) */ - int failb; - - int scrubp; - /* - * number of pages needed to represent the full - * stripe - */ - int nr_pages; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; +/* + * The PFN may still be valid, but our paddrs should always be block size + * aligned, thus such -1 paddr is definitely not a valid one. + */ +#define INVALID_PADDR (~(phys_addr_t)0) - atomic_t error; - /* - * these are two arrays of pointers. We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); +static void scrub_rbio_work_locked(struct work_struct *work); - /* - * pointers to the pages in the bio_list. 
Stored - * here for faster lookup - */ - struct page **bio_pages; +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + bitmap_free(rbio->error_bitmap); + kfree(rbio->stripe_pages); + kfree(rbio->bio_paddrs); + kfree(rbio->stripe_paddrs); + kfree(rbio->finish_pointers); +} - /* - * bitmap to record which horizontal stripe has data - */ - unsigned long *dbitmap; +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; + if (!refcount_dec_and_test(&rbio->refs)) + return; - /* allocated with stripe_npages-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; -}; + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work *work); -static void read_rebuild_work(struct btrfs_work *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); -static void index_rbio_pages(struct btrfs_raid_bio *rbio); -static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct btrfs_work *work); + btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); + kfree(rbio); +} -static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { - btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -204,8 +195,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -226,21 +216,38 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) - kvfree(x); + kvfree(x); return 0; } +static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) +{ + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + + ASSERT(sector_nr < rbio->nr_sectors); + for (int i = 0; i < rbio->sector_nsteps; i++) { + unsigned int index = sector_nr * rbio->sector_nsteps + i; + phys_addr_t dst = rbio->stripe_paddrs[index]; + phys_addr_t src = rbio->bio_paddrs[index]; + + ASSERT(dst != INVALID_PADDR); + ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), step); + } +} + /* * caching an rbio means to copy anything from the - * bio_pages array into the stripe_pages 
array. We + * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * @@ -250,26 +257,27 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio_pages(struct btrfs_raid_bio *rbio) { int i; - char *s; - char *d; int ret; ret = alloc_rbio_pages(rbio); if (ret) return; - for (i = 0; i < rbio->nr_pages; i++) { - if (!rbio->bio_pages[i]) + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. + */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap)); continue; + } - s = kmap(rbio->bio_pages[i]); - d = kmap(rbio->stripe_pages[i]); - - copy_page(d, s); - - kunmap(rbio->bio_pages[i]); - kunmap(rbio->stripe_pages[i]); - SetPageUptodate(rbio->stripe_pages[i]); + memcpy_from_bio_to_stripe(rbio, i); + set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -279,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bbio->raid_map[0]; + u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. We're using byte @@ -292,32 +300,143 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +/* Get the sector number of the first sector covered by @page_nr. */ +static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + u32 sector_nr; + + ASSERT(page_nr < rbio->nr_pages); + + sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; + ASSERT(sector_nr < rbio->nr_sectors); + return sector_nr; +} + /* - * stealing an rbio means taking all the uptodate pages from the stripe - * array in the source rbio and putting them into the destination rbio + * Get the number of sectors covered by @page_nr. + * + * For bs > ps cases, the result will always be 1. + * For bs <= ps cases, the result will be ps / bs. + */ +static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 nr_sectors; + + ASSERT(page_nr < rbio->nr_pages); + + nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits; + ASSERT(nr_sectors > 0); + return nr_sectors; +} + +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr); + int i; + + ASSERT(page_nr < rbio->nr_pages); + ASSERT(sector_nr + nr_bits < rbio->nr_sectors); + + for (i = sector_nr; i < sector_nr + nr_bits; i++) { + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) + return false; + } + return true; +} + +/* + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. 
+ */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps; + i++, offset += step) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); + } +} + +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(src, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(src, page_nr); + + ASSERT(page_nr < src->nr_pages); + ASSERT(sector_nr + nr_bits < src->nr_sectors); + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the stripe_uptodate_bitmap bits. */ + bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits); +} + +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = page_nr_to_sector_nr(rbio, page_nr); + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !PageUptodate(s)) { - continue; - } + struct page *p = src->stripe_pages[i]; - d = dest->stripe_pages[i]; - if (d) - __free_page(d); + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) + continue; - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); + steal_rbio_page(src, dest, i); } + index_stripe_sectors(dest); + index_stripe_sectors(src); } /* @@ -330,10 +449,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; - dest->generic_bio_cnt += victim->generic_bio_cnt; - bio_list_init(&victim->bio_list); + /* Also inherit the bitmaps from @victim. 
*/ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); } /* @@ -353,7 +473,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be @@ -394,7 +514,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -403,16 +523,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -421,19 +540,17 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; - unsigned long flags; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -463,14 +580,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ @@ -489,15 +605,15 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -526,15 +642,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { - unsigned long flags; unsigned long size = rbio->bio_list_bytes; int ret = 1; - spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + spin_lock(&rbio->bio_list_lock); + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); + spin_unlock(&rbio->bio_list_lock); return ret; } @@ -567,8 +682,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, 
test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bbio->raid_map[0] != - cur->bbio->raid_map[0]) + if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ @@ -585,64 +699,68 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; + return 1; +} - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } +/* Return the sector index for @stripe_nr and @sector_nr. */ +static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) +{ + unsigned int ret; - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - if (fa != cur_fa || fb != cur_fb) - return 0; - } - return 1; + ret = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(ret < rbio->nr_sectors); + return ret; } -static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, - int index) +/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */ +static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned int step_nr) { - return stripe * rbio->stripe_npages + index; + unsigned int ret; + + ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr); + + ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr; + ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps); + return ret; } -/* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, - int index) +static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr, + unsigned int step_nr) { - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; + return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)]; } -/* - * helper to index into the pstripe - */ -static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { - return rbio_stripe_page(rbio, rbio->nr_data, index); + return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr); } -/* - * helper to index into the qstripe, returns null - * if there is no qstripe - */ -static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) - return NULL; - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); + return INVALID_PADDR; + return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); +} + +/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. 
*/ +static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr) +{ + return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; } /* @@ -672,16 +790,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; - unsigned long flags; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; - h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); @@ -739,14 +856,16 @@ lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -755,16 +874,15 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; - unsigned long flags; int keep_cache = 0; bucket = rbio_bucket(rbio); - h = rbio->fs_info->stripe_hash_table->table + bucket; + h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { @@ -801,19 +919,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - start_async_work(next, rmw_work); + start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -821,43 +936,21 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } done: spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); done_nolock: if (!keep_cache) remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = 
NULL; - } - } - - btrfs_put_bbio(rbio->bbio); - kfree(rbio); -} - -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -867,13 +960,22 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; - if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -885,83 +987,70 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bbio->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - -/* - * the read/modify/write code wants to use the original bio for - * any pages it included, and then use the rbio for everything - * else. This function decides if a given index (stripe number) - * and page number in that stripe fall inside the original bio - * or the rbio. + * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr. * - * if you set bio_list_only, you'll get a NULL back for any ranges - * that are outside the bio_list + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. * - * This doesn't take any refs on anything, you get a bare page pointer - * and the caller must bump refs as required. + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. * - * You must call index_rbio_pages once before you can trust - * the answers from this function. + * Return NULL if bio_list_only is set but the specified sector has no + * coresponding bio. 
*/ -static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, - int index, int pagenr, int bio_list_only) +static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - int chunk_page; - struct page *p = NULL; - - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + phys_addr_t *ret = NULL; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0); - spin_lock_irq(&rbio->bio_list_lock); - p = rbio->bio_pages[chunk_page]; - spin_unlock_irq(&rbio->bio_list_lock); + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); - if (p || bio_list_only) - return p; - - return rbio->stripe_pages[chunk_page]; + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = &rbio->bio_paddrs[index]; + return ret; + } + } + return &rbio->stripe_paddrs[index]; } /* - * number of pages we need for the entire stripe across all the - * drives + * Similar to sector_paddr_in_rbio(), but with extra consideration for + * bs > ps cases, where we can have multiple steps for a fs block. */ -static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) { - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; + phys_addr_t ret = INVALID_PADDR; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); + + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; + return ret; + } + } + return rbio->stripe_paddrs[index]; } /* @@ -969,177 +1058,265 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_bio *bbio, - u64 stripe_len) -{ + struct btrfs_io_context *bioc) +{ + const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; + const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE); + const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; - int nr_data = 0; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; - int num_pages = rbio_nr_pages(stripe_len, real_stripes); - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); - void *p; - - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_pages) * num_pages + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + - sizeof(*rbio->finish_pbitmap) * - BITS_TO_LONGS(stripe_npages), - GFP_NOFS); + + /* + * For bs <= ps cases, ps must be aligned to bs. + * For bs > ps cases, bs must be aligned to ps. 
+ */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) || + IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); + + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). + */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs || + !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } + for (int i = 0; i < num_sectors * sector_nsteps; i++) { + rbio->stripe_paddrs[i] = INVALID_PADDR; + rbio->bio_paddrs[i] = INVALID_PADDR; + } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); - rbio->bbio = bbio; - rbio->fs_info = fs_info; - rbio->stripe_len = stripe_len; + btrfs_get_bioc(bioc); + rbio->bioc = bioc; rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; - rbio->faila = -1; - rbio->failb = -1; + rbio->stripe_nsectors = stripe_nsectors; + rbio->sector_nsteps = sector_nsteps; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * the stripe_pages, bio_pages, etc arrays point to the extra - * memory we allocated past the end of the rbio - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_pages, num_pages); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); -#undef CONSUME_ALLOC - - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); - rbio->nr_data = nr_data; return rbio; } /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + int ret; - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); return 
0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages, false); + if (ret < 0) + return ret; - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; + index_stripe_sectors(rbio); + return 0; +} + +/* + * Return the total number of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; } + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + } + return found_errors; +} + +static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps, + unsigned int step) +{ + int added = 0; + int ret; + + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + if (ret != step) + goto revert; + added += ret; + } + return added; +revert: + /* + * We don't need to revert the bvec, as the bio will be submitted immediately, + * as long as the size is reduced the extra bvec will not be accessed. + */ + bio->bi_iter.bi_size -= added; return 0; } /* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error, and no byte will be added to @rbio. */ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) +static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t *paddrs, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); struct bio *last = bio_list->tail; - u64 last_end = 0; int ret; struct bio *bio; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; u64 disk_start; - stripe = &rbio->bbio->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. 
+ */ + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); + ASSERT(paddrs != NULL); + + stripe = &rbio->bioc->stripes[stripe_nr]; + disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) - return fail_rbio_index(rbio, stripe_nr); + if (!stripe->dev->bdev) { + int found_errors; + + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + NULL, NULL); + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + return 0; + } /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_status && - last->bi_disk == stripe->dev->bdev->bd_disk && - last->bi_partno == stripe->dev->bdev->bd_partno) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) + if (last_end == disk_start && !last->bi_status && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step); + if (ret == sectorsize) return 0; } } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - bio->bi_iter.bi_size = 0; - bio_set_dev(bio, stripe->dev->bdev); - bio->bi_iter.bi_sector = disk_start >> 9; - - bio_add_page(bio, page, PAGE_SIZE, 0); + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + bio->bi_private = rbio; + + ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step); + ASSERT(ret == sectorsize); bio_list_add(bio_list, bio); return 0; } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction. 
The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT); + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->full_stripe_logical; + + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + unsigned int index = (offset >> step_bits); + + rbio->bio_paddrs[index] = paddr; + offset += step; } } @@ -1154,498 +1331,437 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - u64 start; - unsigned long stripe_offset; - unsigned long page_index; - - spin_lock_irq(&rbio->bio_list_lock); - bio_list_for_each(bio, &rbio->bio_list) { - struct bio_vec bvec; - struct bvec_iter iter; - int i = 0; - - start = (u64)bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bbio->raid_map[0]; - page_index = stripe_offset >> PAGE_SHIFT; - - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; - - bio_for_each_segment(bvec, bio, iter) { - rbio->bio_pages[page_index + i] = bvec.bv_page; - i++; - } + + spin_lock(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); + + spin_unlock(&rbio->bio_list_lock); +} + +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; } - spin_unlock_irq(&rbio->bio_list_lock); + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +static inline void bio_list_put(struct bio_list *bio_list) { - struct btrfs_bio *bbio = rbio->bbio; - void **pointers = rbio->finish_pointers; - int nr_data = rbio->nr_data; - int stripe; - int pagenr; - bool has_qstripe; - struct bio_list bio_list; struct bio *bio; - int ret; - bio_list_init(&bio_list); + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} - if (rbio->real_stripes - rbio->nr_data == 1) - has_qstripe = false; - else if (rbio->real_stripes - rbio->nr_data == 2) - has_qstripe = true; - else - BUG(); +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. 
- * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); + ASSERT_RBIO(rbio->real_stripes >= 2, rbio); + ASSERT_RBIO(rbio->nr_data > 0, rbio); /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. + * This is another check to make sure nr data stripes is smaller + * than total stripes. */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); +} - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); - } +static inline void *kmap_local_paddr(phys_addr_t paddr) +{ + /* The sector pointer must have a page mapped to it. */ + ASSERT(paddr != INVALID_PADDR); - /* then add the parity stripe */ - p = rbio_pstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap(p); + return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); +} - if (has_qstripe) { +static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr, + unsigned int step_nr) +{ + void **pointers = rbio->finish_pointers; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q - */ - p = rbio_qstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap(p); + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, - pointers); - } else { - /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); - } + /* Then add the parity stripe */ + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr)); + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + pointers[stripe++] = kmap_local_paddr( + rbio_qstripe_paddr(rbio, sector_nr, step_nr)); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + assert_rbio(rbio); + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], step); + run_xor(pointers + 1, rbio->nr_data - 1, step); } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + +/* Generate PQ for one vertical stripe. 
*/ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6); + + for (int i = 0; i < rbio->sector_nsteps; i++) + generate_pq_vertical_step(rbio, sectornr, i); + + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); + if (has_qstripe) + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); +} + +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + /* The total sector number inside the full stripe. */ + int total_sector_nr; + int sectornr; + int stripe; + int ret; + + ASSERT(bio_list_size(bio_list) == 0); + + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Reset errors, as we may have errors inherited from from degraded + * write. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } - } + /* + * Start assembly. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + phys_addr_t *paddrs; - if (likely(!bbio->num_tgtdevs)) - goto write_data; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bbio->tgtdev_map[stripe]) + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) continue; - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - - ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bbio->tgtdev_map[stripe], - pagenr, rbio->stripe_len); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) + continue; + } else { + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } + + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; } -write_data: - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); + if (likely(!rbio->bioc->replace_nr_stripes)) + return 0; - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; + /* + * Make a copy for the replace target device. + * + * Thus the source stripe number (in replace_stripe_src) should be valid. 
+ */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); - bio->bi_private = rbio; - bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + phys_addr_t *paddrs; - submit_bio(bio); - } - return; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); + /* + * For RAID56, there is only one device that can be replaced, + * and replace_stripe_src[0] indicates the stripe number we + * need to copy from. + */ + if (stripe != rbio->bioc->replace_stripe_src) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) + continue; + } else { + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + } + + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, + rbio->real_stripes, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; + } + + return 0; +error: + bio_list_put(bio_list); + return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 physical = bio->bi_iter.bi_sector; - u64 stripe_start; - int i; - struct btrfs_bio_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bbio->num_stripes; i++) { - stripe = &rbio->bbio->stripes[i]; - stripe_start = stripe->physical; - if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len && - stripe->dev->bdev && - bio->bi_disk == stripe->dev->bdev->bd_disk && - bio->bi_partno == stripe->dev->bdev->bd_partno) { - return i; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->full_stripe_logical; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); + + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); + + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. + */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } } + ASSERT(found_missing); } - return -1; } /* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. + * Return the index inside the rbio->stripe_sectors[] array. + * + * Return -1 if not found. 
*/ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) +static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { - u64 logical = bio->bi_iter.bi_sector; - u64 stripe_start; - int i; - - logical <<= 9; - - for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->bbio->raid_map[i]; - if (logical >= stripe_start && - logical < stripe_start + rbio->stripe_len) { + for (int i = 0; i < rbio->nr_sectors; i++) { + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr) return i; - } } return -1; } /* - * returns -EIO if we had too many failures + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - unsigned long flags; - int ret = 0; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + u32 offset = 0; + phys_addr_t paddr; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + ASSERT(!bio_flagged(bio, BIO_CLONED)); - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; + btrfs_bio_for_each_block_all(paddr, bio, step) { + /* Hitting the first step of a sector. */ + if (IS_ALIGNED(offset, sectorsize)) { + int sector_nr = find_stripe_sector_nr(rbio, paddr); - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); + } + offset += step; } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; } -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); + int i; - return fail_rbio_index(rbio, failed); + for (i = 0; i < rbio->nr_sectors; i++) { + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr) + break; + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; } -/* - * this sets each page in the bio uptodate. It should only be used on private - * rbio pages, nothing that comes in from the higher layers - */ -static void set_bio_pages_uptodate(struct bio *bio) +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int i; - ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; - bio_for_each_segment_all(bvec, bio, iter_all) - SetPageUptodate(bvec->bv_page); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. 
+ */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) { - struct btrfs_raid_bio *rbio = bio->bi_private; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = rbio->sector_nsteps; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 offset = 0; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + phys_addr_t paddr; - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - - bio_put(bio); + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; - if (!atomic_dec_and_test(&rbio->stripes_pending)) + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - goto cleanup; + btrfs_bio_for_each_block_all(paddr, bio, step) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum; - /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first - */ - validate_rbio_for_rmw(rbio); - return; + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; -cleanup: + /* Not yet covering the full fs block, continue to the next step. */ + if (!IS_ALIGNED(offset, fs_info->sectorsize)) + continue; + + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - rbio_orig_end_io(rbio, BLK_STS_IOERR); + expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0)) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; + } } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +static void raid_wait_read_end_io(struct bio *bio) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int pagenr; - int stripe; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); - - atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; - - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. 
If so, be happy and use it - */ - if (PageUptodate(page)) - continue; + struct btrfs_raid_bio *rbio = bio->bi_private; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } + if (bio->bi_status) { + rbio_update_error_bitmap(rbio, bio); + } else { + set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); } - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} - /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; - bio->bi_private = rbio; - bio->bi_end_io = raid_rmw_end_io; - bio->bi_opf = REQ_OP_READ; + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); + } submit_bio(bio); } - /* the actual write will happen once the reads are done */ - return 0; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - return -EIO; - -finish: - validate_rbio_for_rmw(rbio); - return 0; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) { + const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = alloc_rbio_parity_pages(rbio); - if (ret) { - __free_raid_bio(rbio); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); + if (ret < 0) return ret; - } - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); + index_stripe_sectors(rbio); return 0; } /* - * partial stripe writes get handed over to async helpers. - * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); - return 0; -} - -/* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - -/* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged * we collect it into a list. 
When the unplug comes down, @@ -1656,18 +1772,18 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; }; /* * rbios on the plug list are sorted for easier merging. */ -static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { - struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, - plug_list); - struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, - plug_list); + const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; @@ -1678,466 +1794,466 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct btrfs_work *work) +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->full_stripe_logical; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); + ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN, + rbio, orig_logical); - if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); - btrfs_queue_work(plug->info->rmw_workers, - &plug->work); - return; + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. 
*/ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); } - run_plug(plug); } /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - btrfs_put_bbio(bbio); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; - - btrfs_bio_counter_inc_noblocked(fs_info); - rbio->generic_bio_cnt = 1; + rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; - } - - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; - } else { - ret = __raid56_parity_write(rbio); - if (ret) - btrfs_bio_counter_dec(fs_info); } - return ret; + + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) { - int pagenr, stripe; - void **pointers; - int faila = -1, failb = -1; - struct page *page; - blk_status_t err; - int i; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + phys_addr_t *paddrs; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!pointers) { - err = BLK_STS_RESOURCE; - goto cleanup_io; + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; + /* + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. 
+ */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); } - faila = rbio->faila; - failb = rbio->failb; + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) + return -EIO; + return 0; +} + +static void recover_vertical_step(struct btrfs_raid_bio *rbio, + unsigned int sector_nr, + unsigned int step_nr, + int faila, int failb, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + int stripe_nr; - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - } + ASSERT(step_nr < rbio->sector_nsteps); + ASSERT(sector_nr < rbio->stripe_nsectors); - index_rbio_pages(rbio); + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + phys_addr_t paddr; - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(pagenr, rbio->dbitmap)) - continue; + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); + } else { + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); + } + pointers[stripe_nr] = kmap_local_paddr(paddr); + unmap_array[stripe_nr] = pointers[stripe_nr]; + } - /* setup our array of pointers with pages - * from each stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. + */ + goto cleanup; /* - * if we're rebuilding a read, we have to use - * pages from the bio list + * a single failure in raid6 is rebuilt + * in the pstripe code below */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - pointers[stripe] = kmap(page); + goto pstripe; } - /* all raid6 handling here */ - if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* - * single failure, rebuild from parity raid5 - * style - */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. 
+ * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (failb == rbio->real_stripes - 1) { + if (faila == rbio->real_stripes - 2) /* - * a single failure in raid6 is rebuilt - * in the pstripe code below + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) { - int tmp = failb; - failb = faila; - faila = tmp; - } - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! */ - if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bbio->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! - */ - goto pstripe; - } + goto pstripe; + } - if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - PAGE_SIZE, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - PAGE_SIZE, faila, failb, - pointers); - } + if (failb == rbio->real_stripes - 2) { + raid6_datap_recov(rbio->real_stripes, step, + faila, pointers); } else { - void *p; + raid6_2data_recov(rbio->real_stripes, step, + faila, failb, pointers); + } + } else { + void *p; - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); pstripe: - /* Copy parity block into failed block to start with */ - copy_page(pointers[faila], pointers[rbio->nr_data]); - - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; - - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); - } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. 
If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits - */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_npages; i++) { - if (faila != -1) { - page = rbio_stripe_page(rbio, faila, i); - SetPageUptodate(page); - } - if (failb != -1) { - page = rbio_stripe_page(rbio, failb, i); - SetPageUptodate(page); - } - } - } - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * if we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - kunmap(page); - } + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], step); + + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; + + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, step); } - err = BLK_STS_OK; cleanup: - kfree(pointers); + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + int found_errors; + int faila; + int failb; + int ret = 0; -cleanup_io: /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. - */ - if (err == BLK_STS_OK && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - rbio_orig_end_io(rbio, err); - } else if (err == BLK_STS_OK) { - rbio->faila = -1; - rbio->failb = -1; - - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, err); + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. 
+ */ + if (!found_errors) + return 0; + + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + + for (int i = 0; i < rbio->sector_nsteps; i++) + recover_vertical_step(rbio, sector_nr, i, faila, failb, + pointers, unmap_array); + if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + return ret; + + set_bit(rbio_sector_index(rbio, faila, sector_nr), + rbio->stripe_uptodate_bitmap); + } + if (failb >= 0) { + ret = verify_one_sector(rbio, failb, sector_nr); + if (ret < 0) + return ret; + + set_bit(rbio_sector_index(rbio, failb, sector_nr), + rbio->stripe_uptodate_bitmap); } + return ret; } -/* - * This is called only for stripes we've read from disk to - * reconstruct the parity. - */ -static void raid_recover_end_io(struct bio *bio) +static int recover_sectors(struct btrfs_raid_bio *rbio) { - struct btrfs_raid_bio *rbio = bio->bi_private; + void **pointers = NULL; + void **unmap_array = NULL; + int sectornr; + int ret = 0; /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - bio_put(bio); + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock(&rbio->bio_list_lock); + } - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); + index_rbio_pages(rbio); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; + } + +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int pagenr; - int stripe; - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + /* For recovery, we need to read all sectors including P/Q. */ ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; + if (ret < 0) + goto out; - atomic_set(&rbio->error, 0); + index_rbio_pages(rbio); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. 
+ * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); - continue; - } - - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t *paddrs; + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { /* - * the rmw code may have already read this - * page in + * Also set the error bit for missing device, which + * may not yet have its error bit set. */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) - continue; - - ret = rbio_add_io_page(rbio, &bio_list, - rbio_stripe_page(rbio, stripe, pagenr), - stripe, pagenr, rbio->stripe_len); - if (ret < 0) - goto cleanup; + set_bit(total_sector_nr, rbio->error_bitmap); + continue; } - } - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { - __raid_recover_end_io(rbio); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) { + bio_list_put(&bio_list); goto out; - } else { - goto cleanup; } } - /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; + submit_read_wait_bio_list(rbio, &bio_list); + ret = recover_sectors(rbio); +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} - bio->bi_private = rbio; - bio->bi_end_io = raid_recover_end_io; - bio->bi_opf = REQ_OP_READ; +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + rbio = container_of(work, struct btrfs_raid_bio, work); + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); +} - submit_bio(bio); - } -out: - return 0; +static void recover_rbio_work_locked(struct work_struct *work) +{ + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); +} -cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, BLK_STS_IOERR); +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. 
+ */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; - return -EIO; + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); } /* @@ -2146,121 +2262,309 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. */ -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; - - if (generic_io) { - ASSERT(bbio->mirror_num == mirror_num); - btrfs_io_bio(bio)->mirror_num = mirror_num; - } - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bbio(bbio); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; - - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, - "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", - __func__, (u64)bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bbio->map_type); - if (generic_io) - btrfs_put_bbio(bbio); - kfree(rbio); - return -EIO; - } + rbio_add_bio(rbio, bio); - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); - rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bbio(bbio); - } + set_rbio_range_error(rbio, bio); /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) + set_rbio_raid6_extra_error(rbio, mirror_num); + + start_async_work(rbio, recover_rbio_work); +} + +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->full_stripe_logical); + const u64 start = rbio->bioc->full_stripe_logical; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. + * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. 
+ */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->full_stripe_logical, ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors. + */ + fill_data_csums(rbio); + + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate the later csum verification and recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t *paddrs; + + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (bio->bi_status) + rbio_update_error_bitmap(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +/* + * To determine if we need to read any sector from the disk. + * Should only be utilized in RMW path, to skip cached rbio. + */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; + /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 
'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. + * We have a sector which doesn't have page nor uptodate, + * thus this rbio can not be cached one, as cached one must + * have all its data sectors present and uptodate. */ - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; + if (paddr == INVALID_PADDR || + !test_bit(i, rbio->stripe_uptodate_bitmap)) + return true; } + return false; +} - ret = lock_stripe_add(rbio); +static void rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. */ - if (ret == 0) - __raid56_parity_recover(rbio); + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + goto out; + /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. */ - return 0; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } + + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + /* We should have at least one bio assembled. */ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have more errors than our tolerance during the read. 
*/ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL); + if (unlikely(found_errors > rbio->bioc->max_errors)) { + ret = -EIO; + break; + } + } +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } -static void rmw_work(struct btrfs_work *work) +static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* * The following code is used to scrub/replace the parity stripe * - * Caller must have already increased bio_counter for getting @bbio. + * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors) +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2272,45 +2576,40 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* - * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. */ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { - if (bbio->stripes[i].dev == scrub_dev) { + if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } - ASSERT(i < rbio->real_stripes); - - /* Now we just support the sectorsize equals to page size */ - ASSERT(fs_info->sectorsize == PAGE_SIZE); - ASSERT(rbio->stripe_npages == stripe_nsectors); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); - - /* - * We have already increased bio_counter when getting bbio, record it - * so we can free it at rbio_orig_end_io(). - */ - rbio->generic_bio_cnt = 1; + ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; } -/* Used for both parity scrub and missing. 
*/ -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical) +static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio, + int sector_nr) { - int stripe_offset; - int index; + const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize); + const u32 base = sector_nr * rbio->sector_nsteps; - ASSERT(logical >= rbio->bbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + - rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); - index = stripe_offset >> PAGE_SHIFT; - rbio->bio_pages[index] = page; + for (int i = base; i < base + rbio->sector_nsteps; i++) { + const unsigned int page_index = (i * step) >> PAGE_SHIFT; + struct page *page; + + if (rbio->stripe_pages[page_index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[page_index] = page; + } + return 0; } /* @@ -2319,40 +2618,96 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - int i; - int bit; - int index; - struct page *page; + int total_sector_nr; - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { - for (i = 0; i < rbio->real_stripes; i++) { - index = i * rbio->stripe_npages + bit; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int ret; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + ret = alloc_rbio_sector_pages(rbio, total_sector_nr); + if (ret < 0) + return ret; } + index_stripe_sectors(rbio); return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +/* Return true if the content of the step matches the caclulated one. */ +static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr, + unsigned int step_nr) { - struct btrfs_bio *bbio = rbio->bbio; + const unsigned int nr_data = rbio->nr_data; + const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2); + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + void *parity; + bool ret = false; + + ASSERT(step_nr < rbio->sector_nsteps); + + /* First collect one page from each data stripe. */ + for (int stripe = 0; stripe < nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, + step_nr, 0)); + + if (has_qstripe) { + assert_rbio(rbio); + /* RAID6, call the library function to fill in our P/Q. */ + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); + } else { + /* RAID5. */ + memcpy(pointers[nr_data], pointers[0], step); + run_xor(pointers + 1, nr_data - 1, step); + } + + /* Check scrubbing parity and repair it. */ + parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); + if (memcmp(parity, pointers[rbio->scrubp], step) != 0) + memcpy(parity, pointers[rbio->scrubp], step); + else + ret = true; + kunmap_local(parity); + + for (int stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); + return ret; +} + +/* + * The @pointers array should have the P/Q parity already mapped. 
+ */ +static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr) +{ + bool found_error = false; + + for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { + bool match; + + match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); + if (!match) + found_error = true; + } + if (!found_error) + bitmap_clear(&rbio->dbitmap, sector_nr, 1); +} + +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) +{ + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; - int stripe; - int pagenr; + int sectornr; bool has_qstripe; - struct page *p_page = NULL; - struct page *q_page = NULL; + struct page *page; + phys_addr_t p_paddr = INVALID_PADDR; + phys_addr_t q_paddr = INVALID_PADDR; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2365,9 +2720,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, else BUG(); - if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + /* + * Replace is running and our P/Q stripe is being replaced, then we + * need to duplicate the final write to replace target. + */ + if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2377,83 +2736,52 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - if (!need_check) - goto writeback; - - p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!p_page) - goto cleanup; - SetPageUptodate(p_page); + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + p_paddr = page_to_phys(page); + page = NULL; + pointers[nr_data] = kmap_local_paddr(p_paddr); if (has_qstripe) { - q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!q_page) { - __free_page(p_page); - goto cleanup; + /* RAID6, allocate and map temp space for the Q stripe */ + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + return -ENOMEM; } - SetPageUptodate(q_page); + q_paddr = page_to_phys(page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); } - atomic_set(&rbio->error, 0); - - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; - void *parity; - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); - } - - /* then add the parity stripe */ - pointers[stripe++] = kmap(p_page); - - if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q - */ - pointers[stripe++] = kmap(q_page); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, - pointers); - } else { - /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); - } + /* Map the parity stripe just once */ - /* Check scrubbing parity and repair it */ - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - copy_page(parity, pointers[rbio->scrubp]); - else - /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, pagenr, 1); - 
kunmap(p); + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) + verify_one_parity_sector(rbio, pointers, sectornr); - for (stripe = 0; stripe < nr_data; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); - kunmap(p_page); + kunmap_local(pointers[nr_data]); + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + if (q_paddr != INVALID_PADDR) { + __free_page(phys_to_page(q_paddr)); + q_paddr = INVALID_PADDR; } - __free_page(p_page); - if (q_page) - __free_page(q_page); - -writeback: /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + phys_addr_t *paddrs; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, - page, rbio->scrubp, pagenr, rbio->stripe_len); + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2461,45 +2789,28 @@ writeback: if (!is_replace) goto submit_write; - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { - struct page *page; + /* + * Replace is running and our parity stripe needs to be duplicated to + * the target device. Check we have a valid source stripe number. + */ + ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + phys_addr_t *paddrs; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, page, - bbio->tgtdev_map[rbio->scrubp], - pagenr, rbio->stripe_len); + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - - bio->bi_private = rbio; - bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; - - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2509,241 +2820,238 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - goto cleanup; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; + int ret = 0; - if (rbio->faila >= 0 || rbio->failb >= 0) { + /* + * @pointers array stores the pointer for each sector. 
+ * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } + + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + &faila, &failb); + if (unlikely(found_errors > rbio->bioc->max_errors)) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); - if (is_data_stripe(rbio, rbio->failb)) + if (is_data_stripe(rbio, faila)) dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; + else if (is_parity_stripe(faila)) + failp = faila; + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bbio->max_errors - 1) - goto cleanup; - + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { + ret = -EIO; + goto out; + } /* - * If all data is good, only parity is correctly, just - * repair the parity. + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } + if (dfail == 0) + continue; /* * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) - goto cleanup; + if (unlikely(failp != rbio->scrubp)) { + ret = -EIO; + goto out; + } - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct btrfs_raid_bio *rbio = bio->bi_private; + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); + /* Build a list of bios to read all the missing parts. 
*/ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + phys_addr_t *paddrs; - bio_put(bio); + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_paddr_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) + continue; - /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first - */ - validate_rbio_for_parity_scrub(rbio); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (test_bit(rbio_sector_index(rbio, stripe, sectornr), + rbio->stripe_uptodate_bitmap)) + continue; + + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } + + submit_read_wait_bio_list(rbio, &bio_list); + return 0; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { - int bios_to_read = 0; - struct bio_list bio_list; + int sector_nr; int ret; - int pagenr; - int stripe; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; - atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it - */ - if (PageUptodate(page)) - continue; - - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } - } + ret = scrub_assemble_read_bios(rbio); + if (ret < 0) + goto out; - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto out; /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. 
 	 */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
+	ret = finish_parity_scrub(rbio);
+	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+		int found_errors;
+
+		found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
+		if (unlikely(found_errors > rbio->bioc->max_errors)) {
+			ret = -EIO;
 			break;
-
-		bio->bi_private = rbio;
-		bio->bi_end_io = raid56_parity_scrub_end_io;
-		bio->bi_opf = REQ_OP_READ;
-
-		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
-		submit_bio(bio);
+		}
 	}
 
-	/* the actual write will happen once the reads are done */
-	return;
-
-cleanup:
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
-	while ((bio = bio_list_pop(&bio_list)))
-		bio_put(bio);
-
-	return;
-
-finish:
-	validate_rbio_for_parity_scrub(rbio);
+out:
+	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 }
 
-static void scrub_parity_work(struct btrfs_work *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
 {
-	struct btrfs_raid_bio *rbio;
-
-	rbio = container_of(work, struct btrfs_raid_bio, work);
-	raid56_parity_scrub_stripe(rbio);
+	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
 }
 
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 {
 	if (!lock_stripe_add(rbio))
-		start_async_work(rbio, scrub_parity_work);
+		start_async_work(rbio, scrub_rbio_work_locked);
 }
 
-/* The following code is used for dev replace of a missing RAID 5/6 device. */
-
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-			  struct btrfs_bio *bbio, u64 length)
-{
-	struct btrfs_raid_bio *rbio;
-
-	rbio = alloc_rbio(fs_info, bbio, length);
-	if (IS_ERR(rbio))
-		return NULL;
+/*
+ * This is for scrub call sites where we already have correct data contents.
+ * This allows us to avoid reading data stripes again.
+ *
+ * Unfortunately here we have to copy the folios rather than reusing the
+ * pages, because the rbio has its own page management for its cache.
+ */
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+				     struct folio **data_folios, u64 data_logical)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	const u64 offset_in_full_stripe = data_logical -
+					  rbio->bioc->full_stripe_logical;
+	unsigned int findex = 0;
+	unsigned int foffset = 0;
+	int ret;
 
-	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
-	bio_list_add(&rbio->bio_list, bio);
 	/*
-	 * This is a special bio which is used to hold the completion handler
-	 * and make the scrub rbio is similar to the other types
+	 * If we hit ENOMEM here but the allocation later succeeds at
+	 * raid56_parity_submit_scrub_rbio() time, we just do the extra read,
+	 * not a big deal.
+	 *
+	 * If we hit ENOMEM again at raid56_parity_submit_scrub_rbio() time,
+	 * the bio will get a proper error number set.
 	 */
-	ASSERT(!bio->bi_iter.bi_size);
+	ret = alloc_rbio_data_pages(rbio);
+	if (ret < 0)
+		return;
 
-	rbio->faila = find_logical_bio_stripe(rbio, bio);
-	if (rbio->faila == -1) {
-		BUG();
-		kfree(rbio);
-		return NULL;
+	/* data_logical must be at a stripe boundary and inside the full stripe. 
*/ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); + + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } } - - /* - * When we get bbio, we have already increased bio_counter, record it - * so we can free it at rbio_orig_end_io() - */ - rbio->generic_bio_cnt = 1; - - return rbio; -} - -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) -{ - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + bitmap_set(rbio->stripe_uptodate_bitmap, + offset_in_full_stripe >> fs_info->sectorsize_bits, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); } |
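
For readers following the control flow above, the per-vertical-stripe decision in recover_scrub_rbio() boils down to a little arithmetic on the failed stripe numbers. The following standalone userspace sketch (not part of the patch; the struct and function names such as vstripe and scrub_can_repair() are made up for illustration) mirrors that decision in isolation, assuming the usual RAID5/6 layout where stripes 0 .. nr_data-1 hold data and the remaining stripes hold parity.

/*
 * Standalone sketch of the repair decision recover_scrub_rbio() makes per
 * vertical stripe: count failed data stripes, remember a failed parity
 * stripe, and only attempt data recovery when the failure pattern is still
 * repairable while one parity stripe (scrubp) is being rewritten.
 * All names are illustrative, not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

struct vstripe {
	int nr_data;	/* number of data stripes in the full stripe */
	int max_errors;	/* 1 for RAID5, 2 for RAID6 */
	int scrubp;	/* stripe number of the parity being scrubbed */
	int faila;	/* first failed stripe, -1 if none */
	int failb;	/* second failed stripe, -1 if none */
};

static bool is_data(const struct vstripe *v, int stripe)
{
	return stripe >= 0 && stripe < v->nr_data;
}

static bool is_parity(const struct vstripe *v, int stripe)
{
	return stripe >= v->nr_data;
}

/* Return true if the vertical stripe is still repairable during scrub. */
static bool scrub_can_repair(const struct vstripe *v)
{
	int dfail = 0, failp = -1;

	if (is_data(v, v->faila))
		dfail++;
	else if (is_parity(v, v->faila))
		failp = v->faila;

	if (is_data(v, v->failb))
		dfail++;
	else if (is_parity(v, v->failb))
		failp = v->failb;

	/* The scrubbed parity can not help repair data, so one repair "slot" is lost. */
	if (dfail > v->max_errors - 1)
		return false;

	/* Only parity failed: rewriting the parity is enough. */
	if (dfail == 0)
		return true;

	/*
	 * A data stripe failed. Mirroring the patch's conservative check,
	 * repair only proceeds when the corrupted parity is exactly the one
	 * being scrubbed, so the other parity can reconstruct the data.
	 */
	return failp == v->scrubp;
}

int main(void)
{
	/* RAID6, 4 data stripes, scrubbing P (stripe 4); data stripe 1 and P failed. */
	struct vstripe v = {
		.nr_data = 4, .max_errors = 2, .scrubp = 4, .faila = 1, .failb = 4,
	};

	printf("repairable: %s\n", scrub_can_repair(&v) ? "yes" : "no");
	return 0;
}

With RAID6 (max_errors = 2), losing one data stripe plus the parity currently being scrubbed is still repairable via the remaining parity, which is the case the example exercises; a second data failure, or a failure in the non-scrubbed parity, makes the vertical stripe unrepairable on this path.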
