diff options
Diffstat (limited to 'drivers/md/raid5-cache.c')
| -rw-r--r-- | drivers/md/raid5-cache.c | 493 |
1 files changed, 263 insertions, 230 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..e29e69335c69 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 Shaohua Li <shli@fb.com> * Copyright (C) 2016 Song Liu <songliubraving@fb.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * */ #include <linux/kernel.h> #include <linux/wait.h> @@ -23,7 +14,7 @@ #include <linux/types.h> #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* @@ -125,16 +116,16 @@ struct r5l_log { struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ struct kmem_cache *io_kc; - mempool_t *io_pool; - struct bio_set *bs; - mempool_t *meta_pool; + mempool_t io_pool; + struct bio_set bs; + mempool_t meta_pool; - struct md_thread *reclaim_thread; + struct md_thread __rcu *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be * reclaimed. if it's 0, reclaim spaces * used by io_units which are in * IO_UNIT_STRIPE_END state (eg, reclaim - * dones't wait for specific io_unit + * doesn't wait for specific io_unit * switching to IO_UNIT_STRIPE_END * state) */ wait_queue_head_t iounit_wait; @@ -204,9 +195,7 @@ struct r5l_log { static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { - sector_t offset; - - offset = sector_div(sect, conf->chunk_sectors); + sector_div(sect, conf->chunk_sectors); return sect; } @@ -236,9 +225,10 @@ struct r5l_io_unit { bool need_split_bio; struct bio *split_bio; - unsigned int has_flush:1; /* include flush request */ - unsigned int has_fua:1; /* include fua request */ - unsigned int has_null_flush:1; /* include empty flush request */ + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ /* * io isn't sent yet, flush/fua request can only be submitted till it's * the first IO in running_ios list @@ -306,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; @@ -323,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); } } } @@ -337,8 +323,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space); void r5c_check_stripe_cache_usage(struct r5conf *conf) { int total_cached; + struct r5l_log *log = READ_ONCE(conf->log); - if (!r5c_is_writeback(conf->log)) + if (!r5c_is_writeback(log)) return; total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + @@ -354,7 +341,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ if (total_cached > conf->min_nr_stripes * 1 / 2 || atomic_read(&conf->empty_inactive_list_nr) > 0) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -363,7 +350,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ void r5c_check_cached_full_stripe(struct r5conf *conf) { - if (!r5c_is_writeback(conf->log)) + struct r5l_log *log = READ_ONCE(conf->log); + + if (!r5c_is_writeback(log)) return; /* @@ -372,8 +361,8 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), - conf->chunk_sectors >> STRIPE_SHIFT)) - r5l_wake_reclaim(conf->log, 0); + conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) + r5l_wake_reclaim(log, 0); } /* @@ -406,7 +395,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!r5c_is_writeback(log)) return 0; @@ -459,7 +448,7 @@ static inline void r5c_update_log_state(struct r5l_log *log) void r5c_make_stripe_write_out(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); BUG_ON(!r5c_is_writeback(log)); @@ -501,7 +490,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh) */ static void r5c_finish_cache_stripe(struct stripe_head *sh) { - struct r5l_log *log = sh->raid_conf->log; + struct r5l_log *log = READ_ONCE(sh->raid_conf->log); if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); @@ -538,7 +527,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -554,7 +543,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -571,15 +560,27 @@ static void r5l_log_endio(struct bio *bio) struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; + bool has_null_flush; + bool has_flush_payload; if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); bio_put(bio); - mempool_free(io->meta_page, log->meta_pool); + mempool_free(io->meta_page, &log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else @@ -600,19 +601,23 @@ static void r5l_log_endio(struct bio *bio) if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); - if (io->has_null_flush) { + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { struct bio *bi; WARN_ON(bio_list_empty(&io->flush_barriers)); while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { bio_endio(bi); - atomic_dec(&io->pending_stripe); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } } } - - /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -676,6 +681,7 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -684,17 +690,20 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, + !READ_ONCE(conf->log) || !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + log = READ_ONCE(conf->log); + if (log) { + mddev_suspend(mddev, false); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) { struct r5l_io_unit *io = log->current_io; - struct bio *bio; struct r5l_meta_block *block; unsigned long flags; u32 crc; @@ -705,9 +714,8 @@ static void r5l_submit_current_io(struct r5l_log *log) block = page_address(io->meta_page); block->meta_size = cpu_to_le32(io->meta_offset); - crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); - bio = io->current_bio; log->current_io = NULL; spin_lock_irqsave(&log->io_list_lock, flags); @@ -725,10 +733,9 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); + struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS, + REQ_OP_WRITE, GFP_NOIO, &log->bs); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_bdev = log->rdev->bdev; bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; return bio; @@ -757,7 +764,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - io = mempool_alloc(log->io_pool, GFP_ATOMIC); + io = mempool_alloc(&log->io_pool, GFP_ATOMIC); if (!io) return NULL; memset(io, 0, sizeof(*io)); @@ -768,7 +775,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) bio_list_init(&io->flush_barriers); io->state = IO_UNIT_RUNNING; - io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); + io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); @@ -783,7 +790,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) io->current_bio = r5l_bio_alloc(log); io->current_bio->bi_end_io = r5l_log_endio; io->current_bio->bi_private = io; - bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + __bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); r5_reserve_log_entry(log, io); @@ -881,6 +888,11 @@ static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) payload->size = cpu_to_le32(sizeof(__le64)); payload->flush_stripes[0] = cpu_to_le64(sect); io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } mutex_unlock(&log->io_mutex); } @@ -1007,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) /* checksum is already calculated in last run */ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); } parity_pages = 1 + !!(sh->qd_idx >= 0); data_pages = write_disks - parity_pages; @@ -1083,9 +1095,6 @@ void r5l_write_stripe_run(struct r5l_log *log) int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) { - if (!log) - return -ENODEV; - if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { /* * in write through (journal only) @@ -1140,7 +1149,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) static sector_t r5c_calculate_new_cp(struct r5conf *conf) { struct stripe_head *sh; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t new_cp; unsigned long flags; @@ -1148,12 +1157,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf) return log->next_checkpoint; spin_lock_irqsave(&log->stripe_in_journal_lock, flags); - if (list_empty(&conf->log->stripe_in_journal_list)) { + if (list_empty(&log->stripe_in_journal_list)) { /* all stripes flushed */ spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); return log->next_checkpoint; } - sh = list_first_entry(&conf->log->stripe_in_journal_list, + sh = list_first_entry(&log->stripe_in_journal_list, struct stripe_head, r5c); new_cp = sh->log_start; spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); @@ -1172,7 +1181,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1188,7 +1197,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1198,7 +1207,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_checkpoint = io->log_start; list_del(&io->log_sibling); - mempool_free(io, log->io_pool); + mempool_free(io, &log->io_pool); r5l_run_no_mem_stripe(log); found = true; @@ -1249,6 +1258,7 @@ static void r5l_log_flush_endio(struct bio *bio) if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); spin_lock_irqsave(&log->io_list_lock, flags); list_for_each_entry(io, &log->flushing_ios, log_sibling) @@ -1290,10 +1300,9 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; - bio_reset(&log->flush_bio); - log->flush_bio.bi_bdev = log->rdev->bdev; + bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0, + REQ_OP_WRITE | REQ_PREFLUSH); log->flush_bio.bi_end_io = r5l_log_flush_endio; - log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); } @@ -1306,7 +1315,7 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, r5l_write_super(log, end); - if (!blk_queue_discard(bdev_get_queue(bdev))) + if (!bdev_max_discard_sectors(bdev)) return; mddev = log->rdev->mddev; @@ -1315,9 +1324,9 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * superblock is updated to new log tail. Updating superblock (either * directly call md_update_sb() or depend on md thread) must hold * reconfig mutex. On the other hand, raid5_quiesce is called with - * reconfig_mutex hold. The first step of raid5_quiesce() is waitting - * for all IO finish, hence waitting for reclaim thread, while reclaim - * thread is calling this function and waitting for reconfig mutex. So + * reconfig_mutex hold. The first step of raid5_quiesce() is waiting + * for all IO finish, hence waiting for reclaim thread, while reclaim + * thread is calling this function and waiting for reconfig mutex. So * there is a deadlock. We workaround this issue with a trylock. * FIXME: we could miss discard if we can't take reconfig mutex */ @@ -1332,14 +1341,14 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, if (log->last_checkpoint < end) { blkdev_issue_discard(bdev, log->last_checkpoint + log->rdev->data_offset, - end - log->last_checkpoint, GFP_NOIO, 0); + end - log->last_checkpoint, GFP_NOIO); } else { blkdev_issue_discard(bdev, log->last_checkpoint + log->rdev->data_offset, log->device_size - log->last_checkpoint, - GFP_NOIO, 0); + GFP_NOIO); blkdev_issue_discard(bdev, log->rdev->data_offset, end, - GFP_NOIO, 0); + GFP_NOIO); } } @@ -1360,7 +1369,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1387,8 +1396,8 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); - if (!conf->log) + lockdep_assert_held(&conf->device_lock); + if (!READ_ONCE(conf->log)) return; count = 0; @@ -1409,7 +1418,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) static void r5c_do_reclaim(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); struct stripe_head *sh; int count = 0; unsigned long flags; @@ -1538,7 +1547,7 @@ static void r5l_reclaim_thread(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; @@ -1553,45 +1562,40 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) if (!log) return; + + target = READ_ONCE(log->reclaim_target); do { - target = log->reclaim_target; if (new < target) return; - } while (cmpxchg(&log->reclaim_target, target, new) != target); + } while (!try_cmpxchg(&log->reclaim_target, &target, new)); md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { - struct mddev *mddev; - if (!log || state == 2) - return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + struct mddev *mddev = log->rdev->mddev; + struct md_thread *thread = rcu_dereference_protected( + log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ - mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - kthread_park(log->reclaim_thread->tsk); + kthread_park(thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log; - bool ret; - /* don't allow write if journal disk is missing */ - rcu_read_lock(); - log = rcu_dereference(conf->log); + struct r5l_log *log = READ_ONCE(conf->log); + /* don't allow write if journal disk is missing */ if (!log) - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); else - ret = test_bit(Faulty, &log->rdev->flags); - rcu_read_unlock(); - return ret; + return test_bit(Faulty, &log->rdev->flags); } #define R5L_RECOVERY_PAGE_POOL_SIZE 256 @@ -1613,10 +1617,10 @@ struct r5l_recovery_ctx { * just copy data from the pool. */ struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; + struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE]; sector_t pool_offset; /* offset of first page in the pool */ int total_pages; /* total allocated pages */ int valid_pages; /* pages with valid data */ - struct bio *ra_bio; /* bio to do the read ahead */ }; static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, @@ -1624,10 +1628,6 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs); - if (!ctx->ra_bio) - return -ENOMEM; - ctx->valid_pages = 0; ctx->total_pages = 0; while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { @@ -1639,10 +1639,8 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, ctx->total_pages += 1; } - if (ctx->total_pages == 0) { - bio_put(ctx->ra_bio); + if (ctx->total_pages == 0) return -ENOMEM; - } ctx->pool_offset = 0; return 0; @@ -1655,7 +1653,6 @@ static void r5l_recovery_free_ra_pool(struct r5l_log *log, for (i = 0; i < ctx->total_pages; ++i) put_page(ctx->ra_pool[i]); - bio_put(ctx->ra_bio); } /* @@ -1668,17 +1665,19 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx, sector_t offset) { - bio_reset(ctx->ra_bio); - ctx->ra_bio->bi_bdev = log->rdev->bdev; - bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); - ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; + struct bio bio; + int ret; + + bio_init(&bio, log->rdev->bdev, ctx->ra_bvec, + R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ); + bio.bi_iter.bi_sector = log->rdev->data_offset + offset; ctx->valid_pages = 0; ctx->pool_offset = offset; while (ctx->valid_pages < ctx->total_pages) { - bio_add_page(ctx->ra_bio, - ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0); + __bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, + 0); ctx->valid_pages += 1; offset = r5l_ring_add(log, offset, BLOCK_SECTORS); @@ -1687,7 +1686,9 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, break; } - return submit_bio_wait(ctx->ra_bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + return ret; } /* @@ -1740,7 +1741,7 @@ static int r5l_recovery_read_meta_block(struct r5l_log *log, le64_to_cpu(mb->position) != ctx->pos) return -EINVAL; - crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != crc) return -EINVAL; @@ -1779,9 +1780,8 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, return -ENOMEM; r5l_recovery_create_empty_meta_block(log, page, pos, seq); mb = page_address(page); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); - if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, mb, PAGE_SIZE)); + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false)) { __free_page(page); return -EIO; @@ -1885,28 +1885,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, continue; /* in case device is broken */ - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[disk_index].rdev); + rdev = conf->disks[disk_index].rdev; if (rdev) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); sync_page_io(rdev, sh->sector, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_WRITE, 0, + sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rdev, rdev->mddev); - rcu_read_lock(); } - rrdev = rcu_dereference(conf->disks[disk_index].replacement); + rrdev = conf->disks[disk_index].replacement; if (rrdev) { atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); sync_page_io(rrdev, sh->sector, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_WRITE, 0, + sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rrdev, rrdev->mddev); - rcu_read_lock(); } - rcu_read_unlock(); } ctx->data_parity_stripes++; out: @@ -1914,12 +1908,15 @@ out: } static struct stripe_head * -r5c_recovery_alloc_stripe(struct r5conf *conf, - sector_t stripe_sect) +r5c_recovery_alloc_stripe( + struct r5conf *conf, + sector_t stripe_sect, + int noblock) { struct stripe_head *sh; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); + sh = raid5_get_active_stripe(conf, NULL, stripe_sect, + noblock ? R5_GAS_NOBLOCK : 0); if (!sh) return NULL; /* no more stripe available */ @@ -1977,9 +1974,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log, u32 checksum; r5l_recovery_read_page(log, ctx, page, log_offset); - addr = kmap_atomic(page); - checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(page); + checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_local(addr); return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; } @@ -2129,7 +2126,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, stripe_sect); if (!sh) { - sh = r5c_recovery_alloc_stripe(conf, stripe_sect); + sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); /* * cannot get stripe from raid5_get_active_stripe * try replay some stripes @@ -2138,20 +2135,29 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, r5c_recovery_replay_stripes( cached_stripe_list, ctx); sh = r5c_recovery_alloc_stripe( - conf, stripe_sect); + conf, stripe_sect, 1); } if (!sh) { + int new_size = conf->min_nr_stripes * 2; pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", mdname(mddev), - conf->min_nr_stripes * 2); - raid5_set_cache_size(mddev, - conf->min_nr_stripes * 2); - sh = r5c_recovery_alloc_stripe(conf, - stripe_sect); + new_size); + ret = raid5_set_cache_size(mddev, new_size); + if (conf->min_nr_stripes <= new_size / 2) { + pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", + mdname(mddev), + ret, + new_size, + conf->min_nr_stripes, + conf->max_nr_stripes); + return -ENOMEM; + } + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect, 0); } if (!sh) { pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", - mdname(mddev)); + mdname(mddev)); return -ENOMEM; } list_add_tail(&sh->lru, cached_stripe_list); @@ -2370,13 +2376,13 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, payload->size = cpu_to_le32(BLOCK_SECTORS); payload->location = cpu_to_le64( raid5_compute_blocknr(sh, i, 0)); - addr = kmap_atomic(dev->page); + addr = kmap_local_page(dev->page); payload->checksum[0] = cpu_to_le32( - crc32c_le(log->uuid_checksum, addr, - PAGE_SIZE)); - kunmap_atomic(addr); + crc32c(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_local(addr); sync_page_io(log->rdev, write_pos, PAGE_SIZE, - dev->page, REQ_OP_WRITE, 0, false); + dev->page, REQ_OP_WRITE, false); write_pos = r5l_ring_add(log, write_pos, BLOCK_SECTORS); offset += sizeof(__le32) + @@ -2385,10 +2391,10 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, } } mb->meta_size = cpu_to_le32(offset); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, + mb, PAGE_SIZE)); sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, - REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false); + REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); sh->log_start = ctx->pos; list_add_tail(&sh->r5c, &log->stripe_in_journal_list); atomic_inc(&log->stripe_in_journal_count); @@ -2407,10 +2413,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; struct stripe_head *sh, *next; + bool cleared_pending = false; if (ctx->data_only_stripes == 0) return; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + cleared_pending = true; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + } log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { @@ -2420,12 +2431,13 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, raid5_release_stripe(sh); } - md_wakeup_thread(conf->mddev->thread); /* reuse conf->wait_for_quiescent in recovery */ wait_event(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + if (cleared_pending) + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); } static int r5l_recovery_log(struct r5l_log *log) @@ -2463,10 +2475,10 @@ static int r5l_recovery_log(struct r5l_log *log) ctx->seq += 10000; if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) - pr_debug("md/raid:%s: starting from clean shutdown\n", + pr_info("md/raid:%s: starting from clean shutdown\n", mdname(mddev)); else - pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", + pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", mdname(mddev), ctx->data_only_stripes, ctx->data_parity_stripes); @@ -2507,11 +2519,16 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; int ret; - if (!conf->log) - return 0; + ret = mddev_lock(mddev); + if (ret) + return ret; + + conf = mddev->private; + if (!conf || !conf->log) + goto out_unlock; switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2529,6 +2546,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } + +out_unlock: + mddev_unlock(mddev); return ret; } @@ -2540,23 +2560,21 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) */ int r5c_journal_mode_set(struct mddev *mddev, int mode) { - struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; - - if (!log) - return -ENODEV; + struct r5conf *conf; if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + conf = mddev->private; + if (!conf || !conf->log) + return -ENODEV; + if (raid5_calc_degraded(conf) > 0 && mode == R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; - mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; - mddev_resume(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); @@ -2569,6 +2587,7 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, { int mode = ARRAY_SIZE(r5c_journal_mode_str); size_t len = length; + int ret; if (len < 2) return -EINVAL; @@ -2580,8 +2599,12 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, if (strlen(r5c_journal_mode_str[mode]) == len && !strncmp(page, r5c_journal_mode_str[mode], len)) break; - - return r5c_journal_mode_set(mddev, mode) ?: length; + ret = mddev_suspend_and_lock(mddev); + if (ret) + return ret; + ret = r5c_journal_mode_set(mddev, mode); + mddev_unlock_and_resume(mddev); + return ret ?: length; } struct md_sysfs_entry @@ -2601,11 +2624,11 @@ int r5c_try_caching_write(struct r5conf *conf, struct stripe_head_state *s, int disks) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; struct r5dev *dev; int to_cache = 0; - void **pslot; + void __rcu **pslot; sector_t tree_index; int ret; uintptr_t refcount; @@ -2768,11 +2791,10 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; - int do_wakeup = 0; sector_t tree_index; - void **pslot; + void __rcu **pslot; uintptr_t refcount; if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) @@ -2787,7 +2809,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, for (i = sh->disks; i--; ) { clear_bit(R5_InJournal, &sh->dev[i].flags); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); } /* @@ -2800,9 +2822,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (do_wakeup) - wake_up(&conf->wait_for_overlap); - spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); spin_unlock_irq(&log->stripe_in_journal_lock); @@ -2864,10 +2883,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); pages++; } WARN_ON(pages == 0); @@ -2907,14 +2926,13 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) /* check whether this big stripe is in write back cache. */ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t tree_index; void *slot; if (!log) return false; - WARN_ON_ONCE(!rcu_read_lock_held()); tree_index = r5c_tree_index(conf, sect); slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); return slot != NULL; @@ -2937,7 +2955,7 @@ static int r5l_load_log(struct r5l_log *log) if (!page) return -ENOMEM; - if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { + if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) { ret = -EIO; goto ioerr; } @@ -2950,7 +2968,7 @@ static int r5l_load_log(struct r5l_log *log) } stored_crc = le32_to_cpu(mb->checksum); mb->checksum = 0; - expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + expected_crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != expected_crc) { create_super = true; goto create; @@ -2961,7 +2979,7 @@ static int r5l_load_log(struct r5l_log *log) } create: if (create_super) { - log->last_cp_seq = prandom_u32(); + log->last_cp_seq = get_random_u32(); cp = 0; r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); /* @@ -2995,28 +3013,45 @@ ioerr: return ret; } +int r5l_start(struct r5l_log *log) +{ + int ret; + + if (!log) + return 0; + + ret = r5l_load_log(log); + if (ret) { + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + r5l_exit_log(conf); + } + return ret; +} + void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; if ((raid5_calc_degraded(conf) > 0 || test_bit(Journal, &rdev->flags)) && - conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { - struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; - char b[BDEVNAME_SIZE]; + struct md_thread *thread; + int ret; - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(conf->mddev), bdevname(rdev->bdev, b)); + pr_debug("md/raid:%s: using device %pg as journal\n", + mdname(conf->mddev), rdev->bdev); if (PAGE_SIZE != 4096) return -EINVAL; @@ -3040,11 +3075,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log) return -ENOMEM; log->rdev = rdev; - - log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; - - log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, - sizeof(rdev->mddev->uuid)); + log->need_cache_flush = bdev_write_cache(rdev->bdev); + log->uuid_checksum = crc32c(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); mutex_init(&log->io_mutex); @@ -3053,32 +3086,33 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->io_end_ios); INIT_LIST_HEAD(&log->flushing_ios); INIT_LIST_HEAD(&log->finished_ios); - bio_init(&log->flush_bio, NULL, 0); log->io_kc = KMEM_CACHE(r5l_io_unit, 0); if (!log->io_kc) goto io_kc; - log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); - if (!log->io_pool) + ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc); + if (ret) goto io_pool; - log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!log->bs) + ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) goto io_bs; - log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); - if (!log->meta_pool) + ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0); + if (ret) goto out_mempool; spin_lock_init(&log->tree_lock); - INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT); - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - if (!log->reclaim_thread) + thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, + "reclaim"); + if (!thread) goto reclaim_thread; - log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + rcu_assign_pointer(log->reclaim_thread, thread); init_waitqueue_head(&log->iounit_wait); @@ -3095,23 +3129,17 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - rcu_assign_pointer(conf->log, log); - - if (r5l_load_log(log)) - goto error; + WRITE_ONCE(conf->log, log); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; -error: - rcu_assign_pointer(conf->log, NULL); - md_unregister_thread(&log->reclaim_thread); reclaim_thread: - mempool_destroy(log->meta_pool); + mempool_exit(&log->meta_pool); out_mempool: - bioset_free(log->bs); + bioset_exit(&log->bs); io_bs: - mempool_destroy(log->io_pool); + mempool_exit(&log->io_pool); io_pool: kmem_cache_destroy(log->io_kc); io_kc: @@ -3123,14 +3151,19 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - conf->log = NULL; - synchronize_rcu(); + md_unregister_thread(conf->mddev, &log->reclaim_thread); + /* + * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to + * ensure disable_writeback_work wakes up and exits. + */ + WRITE_ONCE(conf->log, NULL); + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); - md_unregister_thread(&log->reclaim_thread); - mempool_destroy(log->meta_pool); - bioset_free(log->bs); - mempool_destroy(log->io_pool); + + mempool_exit(&log->meta_pool); + bioset_exit(&log->bs); + mempool_exit(&log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } |
