diff options
Diffstat (limited to 'drivers/md/raid5.h')
| -rw-r--r-- | drivers/md/raid5.h | 365 |
1 files changed, 330 insertions, 35 deletions
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b0b663b119a8..eafc6e9ed6ee 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,8 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RAID5_H #define _RAID5_H #include <linux/raid/xor.h> #include <linux/dmaengine.h> +#include <linux/local_lock.h> /* * @@ -49,7 +51,7 @@ * can't distinguish between a clean block that has been generated * from parity calculations, and a clean block that has been * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that + * To distinguish these states we have a stripe bit STRIPE_INSYNC that * is set whenever a write is scheduled to the spare, or to the parity * disc if there is no spare. A sync request clears this bit, and * when we find it set with no buffers locked, we know the sync is @@ -155,7 +157,7 @@ */ /* - * Operations state - intermediate states that are visible outside of + * Operations state - intermediate states that are visible outside of * STRIPE_ACTIVE. * In general _idle indicates nothing is running, _run indicates a data * processing operation is active, and _result means the data processing result @@ -194,9 +196,11 @@ enum reconstruct_states { reconstruct_state_result, }; +#define DEFAULT_STRIPE_SIZE 4096 struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ + struct llist_node release_list; struct r5conf *raid_conf; short generation; /* increments with every * reshape */ @@ -204,13 +208,35 @@ struct stripe_head { short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + int overwrite_disks; /* total overwrite disks in stripe, + * this is only checked when stripe + * has STRIPE_BATCH_READY + */ enum check_states check_state; enum reconstruct_states reconstruct_state; spinlock_t stripe_lock; + int cpu; + struct r5worker_group *group; + + struct stripe_head *batch_head; /* protected by stripe lock */ + spinlock_t batch_lock; /* only header's lock is useful */ + struct list_head batch_list; /* protected by head's batch lock*/ + + union { + struct r5l_io_unit *log_io; + struct ppl_io_unit *ppl_io; + }; + + struct list_head log_list; + sector_t log_start; /* first meta block on the journal */ + struct list_head r5c; /* for r5c_cache->stripe_in_journal */ + + struct page *ppl_page; /* partial parity of this stripe */ /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -222,17 +248,27 @@ struct stripe_head { int target, target2; enum sum_check_flags zero_sum_result; } ops; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + /* These pages will be used by bios in dev[i] */ + struct page **pages; + int nr_pages; /* page array size */ + int stripes_per_page; +#endif struct r5dev { /* rreq and rvec are used for the replacement device when * writing data to both devices. */ struct bio req, rreq; struct bio_vec vec, rvec; - struct page *page; + struct page *page, *orig_page; + unsigned int offset; /* offset of the page */ struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; - } dev[1]; /* allocated with extra space depending of RAID geometry */ + u32 log_checksum; + unsigned short write_hint; + } dev[]; /* allocated depending of RAID geometry ("disks" member) */ }; /* stripe_head_state - collects and tracks the dynamic state of a stripe_head @@ -248,14 +284,16 @@ struct stripe_head_state { int syncing, expanding, expanded, replacing; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; + int injournal, just_cached; int failed_num[2]; int p_failed, q_failed; int dec_preread_active; unsigned long ops_request; - struct bio *return_bi; struct md_rdev *blocked_rdev; int handle_bad_blocks; + int log_failed; + int waiting_extra_page; }; /* Flags for struct r5dev.flags */ @@ -295,6 +333,17 @@ enum r5dev_flags { * data in, and now is a good time to write it out. */ R5_Discard, /* Discard the stripe */ + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ + R5_InJournal, /* data being written is in the journal device. + * if R5_InJournal is set for parity pd_idx, all the + * data and parity being written are in the journal + * device + */ + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into + * dev->orig_page for prexor. When this flag is + * set, orig_page contains latest data in the + * raid disk. + */ }; /* @@ -306,9 +355,9 @@ enum { STRIPE_SYNC_REQUESTED, STRIPE_SYNCING, STRIPE_INSYNC, + STRIPE_REPLACED, STRIPE_PREREAD_ACTIVE, STRIPE_DELAYED, - STRIPE_DEGRADED, STRIPE_BIT_DELAY, STRIPE_EXPANDING, STRIPE_EXPAND_SOURCE, @@ -317,11 +366,42 @@ enum { STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, + STRIPE_ON_RELEASE_LIST, + STRIPE_BATCH_READY, + STRIPE_BATCH_ERR, + STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) + * this bit is used in two scenarios: + * + * 1. write-out phase + * set in first entry of r5l_write_stripe + * clear in second entry of r5l_write_stripe + * used to bypass logic in handle_stripe + * + * 2. caching phase + * set in r5c_try_caching_write() + * clear when journal write is done + * used to initiate r5c_cache_data() + * also used to bypass logic in handle_stripe + */ + STRIPE_R5C_CACHING, /* the stripe is in caching phase + * see more detail in the raid5-cache.c + */ + STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_partial_stripe_list) + */ + STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_full_stripe_list) + */ + STRIPE_R5C_PREFLUSH, /* need to flush journal device */ }; +#define STRIPE_EXPAND_SYNC_FLAGS \ + ((1 << STRIPE_EXPAND_SOURCE) |\ + (1 << STRIPE_EXPAND_READY) |\ + (1 << STRIPE_EXPANDING) |\ + (1 << STRIPE_SYNC_REQUESTED)) /* * Operation request flags */ @@ -332,6 +412,25 @@ enum { STRIPE_OP_BIODRAIN, STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, + STRIPE_OP_PARTIAL_PARITY, +}; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, + PARITY_PREFER_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, }; /* * Plugging: @@ -357,19 +456,133 @@ enum { * HANDLE gets cleared if stripe_handle leaves nothing locked. */ +/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk. + * There are three safe ways to access disk_info.rdev. + * 1/ when holding mddev->reconfig_mutex + * 2/ when resync/recovery/reshape is known to be happening - i.e. in code that + * is called as part of performing resync/recovery/reshape. + * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer + * and if it is non-NULL, increment rdev->nr_pending before dropping the RCU + * lock. + * When .rdev is set to NULL, the nr_pending count checked again and if + * it has been incremented, the pointer is put back in .rdev. + */ struct disk_info { - struct md_rdev *rdev, *replacement; + struct md_rdev *rdev; + struct md_rdev *replacement; + struct page *extra_page; /* extra page to use in prexor */ +}; + +/* + * Stripe cache + */ + +#define NR_STRIPES 256 + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define STRIPE_SIZE PAGE_SIZE +#define STRIPE_SHIFT (PAGE_SHIFT - 9) +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + +#define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) +#define HASH_MASK (NR_HASH - 1) +#define MAX_STRIPE_BATCH 8 + +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. + * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + +struct r5worker { + struct work_struct work; + struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + bool working; +}; + +struct r5worker_group { + struct list_head handle_list; + struct list_head loprio_list; + struct r5conf *conf; + struct r5worker *workers; + int stripes_cnt; +}; + +/* + * r5c journal modes of the array: write-back or write-through. + * write-through mode has identical behavior as existing log only + * implementation. + */ +enum r5c_journal_mode { + R5C_JOURNAL_MODE_WRITE_THROUGH = 0, + R5C_JOURNAL_MODE_WRITE_BACK = 1, +}; + +enum r5_cache_state { + R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + R5_ALLOC_MORE, /* It might help to allocate another + * stripe. + */ + R5_DID_ALLOC, /* A stripe was allocated, don't allocate + * more until at least one has been + * released. This avoids flooding + * the cache. + */ + R5C_LOG_TIGHT, /* log device space tight, need to + * prioritize stripes at last_checkpoint + */ + R5C_LOG_CRITICAL, /* log device is running out of space, + * only process stripes that are already + * occupying the log + */ + R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page + * for prexor + */ +}; + +#define PENDING_IO_MAX 512 +#define PENDING_IO_ONE_FLUSH 128 +struct r5pending_data { + struct list_head sibling; + sector_t sector; /* stripe sector */ + struct bio_list bios; +}; + +struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + int scribble_obj_size; + local_lock_t lock; }; struct r5conf { struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; - int level, algorithm; + int level, algorithm, rmw_level; int max_degraded; int raid_disks; int max_nr_stripes; + int min_nr_stripes; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + unsigned long stripe_size; + unsigned int stripe_shift; + unsigned long stripe_sectors; +#endif /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -385,6 +598,7 @@ struct r5conf { int prev_chunk_sectors; int prev_algo; short generation; /* increments with every reshape */ + seqcount_spinlock_t gen_lock; /* lock against generation changes */ unsigned long reshape_checkpoint; /* Time we last updated * metadata */ long long min_offset_diff; /* minimum difference between @@ -395,16 +609,19 @@ struct r5conf { */ struct list_head handle_list; /* stripes needing handling */ + struct list_head loprio_list; /* low priority stripes */ struct list_head hold_list; /* preread ready stripes */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ struct bio *retry_read_aligned; /* currently retrying aligned bios */ + unsigned int retry_read_offset; /* sector offset into retry_read_aligned */ struct bio *retry_read_aligned_list; /* aligned bios retry list */ atomic_t preread_active_stripes; /* stripes with scheduled io */ atomic_t active_aligned_reads; atomic_t pending_full_writes; /* full write backlog */ int bypass_count; /* bypassed prereads */ int bypass_threshold; /* preread nice */ + int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ atomic_t reshape_stripes; /* stripes with pending writes for reshape */ @@ -412,8 +629,9 @@ struct r5conf { * two caches. */ int active_name; - char cache_name[2][32]; - struct kmem_cache *slab_cache; /* for allocating stripes */ + char cache_name[2][48]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + struct mutex cache_size_mutex; /* Protect changes to cache size */ int seq_flush, seq_write; int quiesce; @@ -424,41 +642,83 @@ struct r5conf { */ int recovery_disabled; /* per cpu variables */ - struct raid5_percpu { - struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer - * lists and performing address - * conversions - */ - } __percpu *percpu; - size_t scribble_len; /* size of scribble region must be - * associated with conf to handle - * cpu hotplug while reshaping - */ -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct raid5_percpu __percpu *percpu; + int scribble_disks; + int scribble_sectors; + struct hlist_node node; /* * Free stripes pool */ atomic_t active_stripes; - struct list_head inactive_list; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + + atomic_t r5c_cached_full_stripes; + struct list_head r5c_full_stripe_list; + atomic_t r5c_cached_partial_stripes; + struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; + + atomic_t empty_inactive_list_nr; + struct llist_head released_stripes; + wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ + wait_queue_head_t wait_for_reshape; + unsigned long cache_state; + struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; + struct bio_set bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + struct r5worker_group *worker_groups; + int group_cnt; + int worker_cnt_per_group; + struct r5l_log *log; + void *log_private; + + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; + struct r5pending_data *pending_data; + struct list_head free_list; + struct list_head pending_list; + int pending_data_cnt; + struct r5pending_data *next_pending_data; }; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#else +#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size) +#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift) +#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors) +#endif + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) +{ + if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf)) + return bio->bi_next; + else + return NULL; +} + /* * Our supported algorithms */ @@ -489,7 +749,6 @@ struct r5conf { #define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ #define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ - /* For every RAID5 algorithm we define a RAID6 algorithm * with exactly the same layout for data and parity, and * with the Q block always on the last device (N-1). @@ -521,7 +780,43 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } -extern int md_raid5_congested(struct mddev *mddev, int bits); -extern void md_raid5_kick_device(struct r5conf *conf); -extern int raid5_set_cache_size(struct mddev *mddev, int size); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +/* + * Return offset of the corresponding page for r5dev. + */ +static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx) +{ + return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf); +} + +/* + * Return corresponding page address for r5dev. + */ +static inline struct page * +raid5_get_dev_page(struct stripe_head *sh, int disk_idx) +{ + return sh->pages[disk_idx / sh->stripes_per_page]; +} +#endif + +void md_raid5_kick_device(struct r5conf *conf); +int raid5_set_cache_size(struct mddev *mddev, int size); +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +void raid5_release_stripe(struct stripe_head *sh); +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, struct stripe_head *sh); + +struct stripe_request_ctx; +/* get stripe from previous generation (when reshaping) */ +#define R5_GAS_PREVIOUS (1 << 0) +/* do not block waiting for a free stripe */ +#define R5_GAS_NOBLOCK (1 << 1) +/* do not block waiting for quiesce to be released */ +#define R5_GAS_NOQUIESCE (1 << 2) +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + unsigned int flags); + +int raid5_calc_degraded(struct r5conf *conf); +int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif |
