summary refs log tree commit diff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bcache/bcache.h 12
-rw-r--r-- drivers/md/bcache/btree.c 48
-rw-r--r-- drivers/md/bcache/btree.h 5
-rw-r--r-- drivers/md/bcache/request.c 4
-rw-r--r-- drivers/md/bcache/stats.h 1
-rw-r--r-- drivers/md/bcache/super.c 29
-rw-r--r-- drivers/md/bcache/sysfs.c 31
-rw-r--r-- drivers/md/bcache/sysfs.h 2
-rw-r--r-- drivers/md/bcache/writeback.c 10
-rw-r--r-- drivers/md/dm-cache-metadata.c 2
-rw-r--r-- drivers/md/dm-cache-target.c 12
-rw-r--r-- drivers/md/dm-clone-target.c 10
-rw-r--r-- drivers/md/dm-core.h 7
-rw-r--r-- drivers/md/dm-crypt.c 5
-rw-r--r-- drivers/md/dm-era-target.c 6
-rw-r--r-- drivers/md/dm-init.c 4
-rw-r--r-- drivers/md/dm-integrity.c 4
-rw-r--r-- drivers/md/dm-ioctl.c 15
-rw-r--r-- drivers/md/dm-raid.c 4
-rw-r--r-- drivers/md/dm-snap.c 18
-rw-r--r-- drivers/md/dm-table.c 37
-rw-r--r-- drivers/md/dm-thin-metadata.c 22
-rw-r--r-- drivers/md/dm-thin.c 12
-rw-r--r-- drivers/md/dm-verity-fec.c 2
-rw-r--r-- drivers/md/dm-verity-target.c 6
-rw-r--r-- drivers/md/dm-zoned-metadata.c 6
-rw-r--r-- drivers/md/dm.c 51
-rw-r--r-- drivers/md/dm.h 2
-rw-r--r-- drivers/md/md-autodetect.c 3
-rw-r--r-- drivers/md/md-bitmap.c 93
-rw-r--r-- drivers/md/md-bitmap.h 8
-rw-r--r-- drivers/md/md-cluster.c 17
-rw-r--r-- drivers/md/md-multipath.c 4
-rw-r--r-- drivers/md/md.c 280
-rw-r--r-- drivers/md/md.h 37
-rw-r--r-- drivers/md/raid1-10.c 74
-rw-r--r-- drivers/md/raid1.c 43
-rw-r--r-- drivers/md/raid1.h 2
-rw-r--r-- drivers/md/raid10.c 199
-rw-r--r-- drivers/md/raid10.h 2
-rw-r--r-- drivers/md/raid5-cache.c 24
-rw-r--r-- drivers/md/raid5-ppl.c 4
-rw-r--r-- drivers/md/raid5.c 74
-rw-r--r-- drivers/md/raid5.h 4
44 files changed, 745 insertions, 490 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index aebb7ef10e63..5a79bb3c272f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -275,7 +275,7 @@ struct bcache_device {
int (*cache_miss)(struct btree *b, struct search *s,
struct bio *bio, unsigned int sectors);
- int (*ioctl)(struct bcache_device *d, fmode_t mode,
+ int (*ioctl)(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
};
@@ -1004,11 +1004,11 @@ extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
-extern struct kobj_type bch_cached_dev_ktype;
-extern struct kobj_type bch_flash_dev_ktype;
-extern struct kobj_type bch_cache_set_ktype;
-extern struct kobj_type bch_cache_set_internal_ktype;
-extern struct kobj_type bch_cache_ktype;
+extern const struct kobj_type bch_cached_dev_ktype;
+extern const struct kobj_type bch_flash_dev_ktype;
+extern const struct kobj_type bch_cache_set_ktype;
+extern const struct kobj_type bch_cache_set_internal_ktype;
+extern const struct kobj_type bch_cache_ktype;
void bch_cached_dev_release(struct kobject *kobj);
void bch_flash_dev_release(struct kobject *kobj);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 147c493a989a..fd121a61f17c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -559,6 +559,27 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
}
}
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int btree_lock_cmp_fn(const struct lockdep_map *_a,
+ const struct lockdep_map *_b)
+{
+ const struct btree *a = container_of(_a, struct btree, lock.dep_map);
+ const struct btree *b = container_of(_b, struct btree, lock.dep_map);
+
+ return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
+}
+
+static void btree_lock_print_fn(const struct lockdep_map *map)
+{
+ const struct btree *b = container_of(map, struct btree, lock.dep_map);
+
+ printk(KERN_CONT " l=%u %llu:%llu", b->level,
+ KEY_INODE(&b->key), KEY_OFFSET(&b->key));
+}
+#endif
+
static struct btree *mca_bucket_alloc(struct cache_set *c,
struct bkey *k, gfp_t gfp)
{
@@ -572,7 +593,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
return NULL;
init_rwsem(&b->lock);
- lockdep_set_novalidate_class(&b->lock);
+ lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
mutex_init(&b->write_lock);
lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list);
@@ -885,7 +906,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-static void bch_cannibalize_unlock(struct cache_set *c)
+void bch_cannibalize_unlock(struct cache_set *c)
{
spin_lock(&c->btree_cannibalize_lock);
if (c->btree_cache_alloc_lock == current) {
@@ -1090,10 +1111,12 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
struct btree *parent)
{
BKEY_PADDED(key) k;
- struct btree *b = ERR_PTR(-EAGAIN);
+ struct btree *b;
mutex_lock(&c->bucket_lock);
retry:
+ /* return ERR_PTR(-EAGAIN) when it fails */
+ b = ERR_PTR(-EAGAIN);
if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
goto err;
@@ -1138,7 +1161,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
{
struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key);
@@ -1340,7 +1363,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
memset(new_nodes, 0, sizeof(new_nodes));
closure_init_stack(&cl);
- while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
+ while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b))
keys += r[nodes++].keys;
blocks = btree_default_blocks(b->c) * 2 / 3;
@@ -1352,7 +1375,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
for (i = 0; i < nodes; i++) {
new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
- if (IS_ERR_OR_NULL(new_nodes[i]))
+ if (IS_ERR(new_nodes[i]))
goto out_nocoalesce;
}
@@ -1487,7 +1510,7 @@ out_nocoalesce:
bch_keylist_free(&keylist);
for (i = 0; i < nodes; i++)
- if (!IS_ERR_OR_NULL(new_nodes[i])) {
+ if (!IS_ERR(new_nodes[i])) {
btree_node_free(new_nodes[i]);
rw_unlock(true, new_nodes[i]);
}
@@ -1669,7 +1692,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
if (should_rewrite) {
n = btree_node_alloc_replacement(b, NULL);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
bch_btree_node_write_sync(n);
bch_btree_set_root(n);
@@ -1968,6 +1991,15 @@ static int bch_btree_check_thread(void *arg)
c->gc_stats.nodes++;
bch_btree_op_init(&op, 0);
ret = bcache_btree(check_recurse, p, c->root, &op);
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op)->wait);
if (ret)
goto out;
}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 1b5fdbc0d83e..45d64b54115a 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -247,8 +247,8 @@ static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
static inline void rw_lock(bool w, struct btree *b, int level)
{
- w ? down_write_nested(&b->lock, level + 1)
- : down_read_nested(&b->lock, level + 1);
+ w ? down_write(&b->lock)
+ : down_read(&b->lock);
if (w)
b->seq++;
}
@@ -282,6 +282,7 @@ void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);
+void bch_cannibalize_unlock(struct cache_set *c);
static inline void wake_up_gc(struct cache_set *c)
{
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 67a2e29e0b40..a9b1f3896249 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1228,7 +1228,7 @@ void cached_dev_submit_bio(struct bio *bio)
detached_dev_do_request(d, bio, orig_bdev, start_time);
}
-static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1318,7 +1318,7 @@ void flash_dev_submit_bio(struct bio *bio)
continue_at(cl, search_free, NULL);
}
-static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
return -ENOTTY;
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index bd3afc856d53..21b445f8af15 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -18,7 +18,6 @@ struct cache_stats {
unsigned long cache_misses;
unsigned long cache_bypass_hits;
unsigned long cache_bypass_misses;
- unsigned long cache_readaheads;
unsigned long cache_miss_collisions;
unsigned long sectors_bypassed;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7e9d19fd21dd..e2a803683105 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -732,9 +732,9 @@ out:
/* Bcache device */
-static int open_dev(struct block_device *b, fmode_t mode)
+static int open_dev(struct gendisk *disk, blk_mode_t mode)
{
- struct bcache_device *d = b->bd_disk->private_data;
+ struct bcache_device *d = disk->private_data;
if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
return -ENXIO;
@@ -743,14 +743,14 @@ static int open_dev(struct block_device *b, fmode_t mode)
return 0;
}
-static void release_dev(struct gendisk *b, fmode_t mode)
+static void release_dev(struct gendisk *b)
{
struct bcache_device *d = b->private_data;
closure_put(&d->cl);
}
-static int ioctl_dev(struct block_device *b, fmode_t mode,
+static int ioctl_dev(struct block_device *b, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct bcache_device *d = b->bd_disk->private_data;
@@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl)
put_page(virt_to_page(dc->sb_disk));
if (!IS_ERR_OR_NULL(dc->bdev))
- blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(dc->bdev, bcache_kobj);
wake_up(&unregister_wait);
@@ -1723,7 +1723,7 @@ static void cache_set_flush(struct closure *cl)
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
- if (!IS_ERR_OR_NULL(c->root))
+ if (!IS_ERR(c->root))
list_add(&c->root->list, &c->btree_cache);
/*
@@ -2087,7 +2087,7 @@ static int run_cache_set(struct cache_set *c)
err = "cannot allocate new btree root";
c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
- if (IS_ERR_OR_NULL(c->root))
+ if (IS_ERR(c->root))
goto err;
mutex_lock(&c->root->write_lock);
@@ -2218,7 +2218,7 @@ void bch_cache_release(struct kobject *kobj)
put_page(virt_to_page(ca->sb_disk));
if (!IS_ERR_OR_NULL(ca->bdev))
- blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(ca->bdev, bcache_kobj);
kfree(ca);
module_put(THIS_MODULE);
@@ -2359,7 +2359,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
* call blkdev_put() to bdev in bch_cache_release(). So we
* explicitly call blkdev_put() here.
*/
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(bdev, bcache_kobj);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else if (ret == -EPERM)
@@ -2461,7 +2461,7 @@ static void register_bdev_worker(struct work_struct *work)
if (!dc) {
fail = true;
put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(args->bdev, bcache_kobj);
goto out;
}
@@ -2491,7 +2491,7 @@ static void register_cache_worker(struct work_struct *work)
if (!ca) {
fail = true;
put_page(virt_to_page(args->sb_disk));
- blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(args->bdev, bcache_kobj);
goto out;
}
@@ -2558,9 +2558,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
ret = -EINVAL;
err = "failed to open device";
- bdev = blkdev_get_by_path(strim(path),
- FMODE_READ|FMODE_WRITE|FMODE_EXCL,
- sb);
+ bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE,
+ bcache_kobj, NULL);
if (IS_ERR(bdev)) {
if (bdev == ERR_PTR(-EBUSY)) {
dev_t dev;
@@ -2648,7 +2647,7 @@ async_done:
out_put_sb_page:
put_page(virt_to_page(sb_disk));
out_blkdev_put:
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(bdev, register_bcache);
out_free_sb:
kfree(sb);
out_free_path:
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6f677059214..0e2c1880f60b 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -1111,26 +1111,25 @@ SHOW(__bch_cache)
vfree(p);
- ret = scnprintf(buf, PAGE_SIZE,
- "Unused: %zu%%\n"
- "Clean: %zu%%\n"
- "Dirty: %zu%%\n"
- "Metadata: %zu%%\n"
- "Average: %llu\n"
- "Sectors per Q: %zu\n"
- "Quantiles: [",
- unused * 100 / (size_t) ca->sb.nbuckets,
- available * 100 / (size_t) ca->sb.nbuckets,
- dirty * 100 / (size_t) ca->sb.nbuckets,
- meta * 100 / (size_t) ca->sb.nbuckets, sum,
- n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
+ ret = sysfs_emit(buf,
+ "Unused: %zu%%\n"
+ "Clean: %zu%%\n"
+ "Dirty: %zu%%\n"
+ "Metadata: %zu%%\n"
+ "Average: %llu\n"
+ "Sectors per Q: %zu\n"
+ "Quantiles: [",
+ unused * 100 / (size_t) ca->sb.nbuckets,
+ available * 100 / (size_t) ca->sb.nbuckets,
+ dirty * 100 / (size_t) ca->sb.nbuckets,
+ meta * 100 / (size_t) ca->sb.nbuckets, sum,
+ n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
for (i = 0; i < ARRAY_SIZE(q); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%u ", q[i]);
+ ret += sysfs_emit_at(buf, ret, "%u ", q[i]);
ret--;
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
+ ret += sysfs_emit_at(buf, ret, "]\n");
return ret;
}
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index a2ff6447b699..65b8bd975ab1 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -3,7 +3,7 @@
#define _BCACHE_SYSFS_H_
#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
+const struct kobj_type type ## _ktype = { \
.release = type ## _release, \
.sysfs_ops = &((const struct sysfs_ops) { \
.show = type ## _show, \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d4a5fc0650bb..24c049067f61 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -890,6 +890,16 @@ static int bch_root_node_dirty_init(struct cache_set *c,
if (ret < 0)
pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op.op)->wait);
+
return ret;
}
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 9e0c69958587..acffed750e3e 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs cmd root_lock).
- * - must take shrinker_mutex without holding cmd->root_lock
+ * - must take shrinker_rwsem without holding cmd->root_lock
*/
new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 872896218550..911f73f7ebba 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2051,8 +2051,8 @@ static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
if (!at_least_one_arg(as, error))
return -EINVAL;
- r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
- &ca->metadata_dev);
+ r = dm_get_device(ca->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev);
if (r) {
*error = "Error opening metadata device";
return r;
@@ -2074,8 +2074,8 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
if (!at_least_one_arg(as, error))
return -EINVAL;
- r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
- &ca->cache_dev);
+ r = dm_get_device(ca->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev);
if (r) {
*error = "Error opening cache device";
return r;
@@ -2093,8 +2093,8 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
if (!at_least_one_arg(as, error))
return -EINVAL;
- r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
- &ca->origin_dev);
+ r = dm_get_device(ca->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev);
if (r) {
*error = "Error opening origin device";
return r;
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index f467cdb5a022..94b2fc33f64b 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1683,8 +1683,8 @@ static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char *
int r;
sector_t metadata_dev_size;
- r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
- &clone->metadata_dev);
+ r = dm_get_device(clone->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->metadata_dev);
if (r) {
*error = "Error opening metadata device";
return r;
@@ -1703,8 +1703,8 @@ static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **err
int r;
sector_t dest_dev_size;
- r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
- &clone->dest_dev);
+ r = dm_get_device(clone->ti, dm_shift_arg(as),
+ BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->dest_dev);
if (r) {
*error = "Error opening destination device";
return r;
@@ -1725,7 +1725,7 @@ static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **e
int r;
sector_t source_dev_size;
- r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
+ r = dm_get_device(clone->ti, dm_shift_arg(as), BLK_OPEN_READ,
&clone->source_dev);
if (r) {
*error = "Error opening source device";
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index aecab0c0720f..ce913ad91a52 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -207,11 +207,10 @@ struct dm_table {
unsigned integrity_added:1;
/*
- * Indicates the rw permissions for the new logical
- * device. This should be a combination of FMODE_READ
- * and FMODE_WRITE.
+ * Indicates the rw permissions for the new logical device. This
+ * should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE.
*/
- fmode_t mode;
+ blk_mode_t mode;
/* a list of devices used by this table */
struct list_head devices;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8b47b913ee83..15424bfea7ee 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1693,8 +1693,7 @@ retry:
len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
- bio_add_page(clone, page, len, 0);
-
+ __bio_add_page(clone, page, len, 0);
remaining_size -= len;
}
@@ -3256,7 +3255,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->per_bio_data_size = ti->per_io_data_size =
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
- ARCH_KMALLOC_MINALIGN);
+ ARCH_DMA_MINALIGN);
ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
if (ret) {
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 0d70914217ee..6acfa5bf97a4 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1482,14 +1482,16 @@ static int era_ctr(struct dm_target *ti, unsigned int argc, char **argv)
era->ti = ti;
- r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
+ r = dm_get_device(ti, argv[0], BLK_OPEN_READ | BLK_OPEN_WRITE,
+ &era->metadata_dev);
if (r) {
ti->error = "Error opening metadata device";
era_destroy(era);
return -EINVAL;
}
- r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
+ r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE,
+ &era->origin_dev);
if (r) {
ti->error = "Error opening data device";
era_destroy(era);
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index d369457dbed0..2a71bcdba92d 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -293,8 +293,10 @@ static int __init dm_init_init(void)
for (i = 0; i < ARRAY_SIZE(waitfor); i++) {
if (waitfor[i]) {
+ dev_t dev;
+
DMINFO("waiting for device %s ...", waitfor[i]);
- while (!dm_get_dev_t(waitfor[i]))
+ while (early_lookup_bdev(waitfor[i], &dev))
fsleep(5000);
}
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 31838b13ea54..63ec502fcb12 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4268,10 +4268,10 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
}
/*
- * If this workqueue were percpu, it would cause bio reordering
+ * If this workqueue weren't ordered, it would cause bio reordering
* and reduced performance.
*/
- ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ ic->wait_wq = alloc_ordered_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM);
if (!ic->wait_wq) {
ti->error = "Cannot allocate workqueue";
r = -ENOMEM;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index cc77cf3d4109..6d301019e5e3 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -861,7 +861,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
table = dm_get_inactive_table(md, &srcu_idx);
if (table) {
- if (!(dm_table_get_mode(table) & FMODE_WRITE))
+ if (!(dm_table_get_mode(table) & BLK_OPEN_WRITE))
param->flags |= DM_READONLY_FLAG;
param->target_count = table->num_targets;
}
@@ -1168,13 +1168,10 @@ static int do_resume(struct dm_ioctl *param)
/* Do we need to load a new map ? */
if (new_map) {
sector_t old_size, new_size;
- int srcu_idx;
/* Suspend if it isn't already suspended */
- old_map = dm_get_live_table(md, &srcu_idx);
- if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map)
+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
- dm_put_live_table(md, srcu_idx);
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended_md(md))
@@ -1192,7 +1189,7 @@ static int do_resume(struct dm_ioctl *param)
if (old_size && new_size && old_size != new_size)
need_resize_uevent = true;
- if (dm_table_get_mode(new_map) & FMODE_WRITE)
+ if (dm_table_get_mode(new_map) & BLK_OPEN_WRITE)
set_disk_ro(dm_disk(md), 0);
else
set_disk_ro(dm_disk(md), 1);
@@ -1381,12 +1378,12 @@ static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_
return 0;
}
-static inline fmode_t get_mode(struct dm_ioctl *param)
+static inline blk_mode_t get_mode(struct dm_ioctl *param)
{
- fmode_t mode = FMODE_READ | FMODE_WRITE;
+ blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE;
if (param->flags & DM_READONLY_FLAG)
- mode = FMODE_READ;
+ mode = BLK_OPEN_READ;
return mode;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c8821fcb8299..8846bf510a35 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
* canceling read-auto mode
*/
mddev->ro = 0;
- if (!mddev->suspended && mddev->sync_thread)
+ if (!mddev->suspended)
md_wakeup_thread(mddev->sync_thread);
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- if (!mddev->suspended && mddev->thread)
+ if (!mddev->suspended)
md_wakeup_thread(mddev->thread);
return 0;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9c49f53760d0..bf7a574499a3 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1241,9 +1241,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int i;
int r = -EINVAL;
char *origin_path, *cow_path;
- dev_t origin_dev, cow_dev;
unsigned int args_used, num_flush_bios = 1;
- fmode_t origin_mode = FMODE_READ;
+ blk_mode_t origin_mode = BLK_OPEN_READ;
if (argc < 4) {
ti->error = "requires 4 or more arguments";
@@ -1253,7 +1252,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (dm_target_is_snapshot_merge(ti)) {
num_flush_bios = 2;
- origin_mode = FMODE_WRITE;
+ origin_mode = BLK_OPEN_WRITE;
}
s = kzalloc(sizeof(*s), GFP_KERNEL);
@@ -1279,24 +1278,21 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot get origin device";
goto bad_origin;
}
- origin_dev = s->origin->bdev->bd_dev;
cow_path = argv[0];
argv++;
argc--;
- cow_dev = dm_get_dev_t(cow_path);
- if (cow_dev && cow_dev == origin_dev) {
- ti->error = "COW device cannot be the same as origin device";
- r = -EINVAL;
- goto bad_cow;
- }
-
r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
if (r) {
ti->error = "Cannot get COW device";
goto bad_cow;
}
+ if (s->cow->bdev && s->cow->bdev == s->origin->bdev) {
+ ti->error = "COW device cannot be the same as origin device";
+ r = -EINVAL;
+ goto bad_store;
+ }
r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
if (r) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1398f1d6e83e..7d208b2b1a19 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -126,7 +126,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
return 0;
}
-int dm_table_create(struct dm_table **result, fmode_t mode,
+int dm_table_create(struct dm_table **result, blk_mode_t mode,
unsigned int num_targets, struct mapped_device *md)
{
struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
@@ -304,7 +304,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* device and not to touch the existing bdev field in case
* it is accessed concurrently.
*/
-static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
+static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode,
struct mapped_device *md)
{
int r;
@@ -324,23 +324,13 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
}
/*
- * Convert the path to a device
- */
-dev_t dm_get_dev_t(const char *path)
-{
- dev_t dev;
-
- if (lookup_bdev(path, &dev))
- dev = name_to_dev_t(path);
- return dev;
-}
-EXPORT_SYMBOL_GPL(dm_get_dev_t);
-
-/*
* Add a device to the list, or just increment the usage count if
* it's already present.
+ *
+ * Note: the __ref annotation is because this function can call the __init
+ * marked early_lookup_bdev when called during early boot code from dm-init.c.
*/
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode,
struct dm_dev **result)
{
int r;
@@ -358,9 +348,13 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
if (MAJOR(dev) != major || MINOR(dev) != minor)
return -EOVERFLOW;
} else {
- dev = dm_get_dev_t(path);
- if (!dev)
- return -ENODEV;
+ r = lookup_bdev(path, &dev);
+#ifndef MODULE
+ if (r && system_state < SYSTEM_RUNNING)
+ r = early_lookup_bdev(path, &dev);
+#endif
+ if (r)
+ return r;
}
if (dev == disk_devt(t->md->disk))
return -EINVAL;
@@ -668,7 +662,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->singleton = true;
}
- if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) {
+ if (dm_target_always_writeable(ti->type) &&
+ !(t->mode & BLK_OPEN_WRITE)) {
ti->error = "target type may not be included in a read-only table";
goto bad;
}
@@ -2039,7 +2034,7 @@ struct list_head *dm_table_get_devices(struct dm_table *t)
return &t->devices;
}
-fmode_t dm_table_get_mode(struct dm_table *t)
+blk_mode_t dm_table_get_mode(struct dm_table *t)
{
return t->mode;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 9f5cb52c5763..9dd0409848ab 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1756,13 +1756,15 @@ int dm_thin_remove_range(struct dm_thin_device *td,
int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
- int r;
+ int r = -EINVAL;
uint32_t ref_count;
down_read(&pmd->root_lock);
- r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
- if (!r)
- *result = (ref_count > 1);
+ if (!pmd->fail_io) {
+ r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
+ if (!r)
+ *result = (ref_count > 1);
+ }
up_read(&pmd->root_lock);
return r;
@@ -1770,10 +1772,11 @@ int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *re
int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
- int r = 0;
+ int r = -EINVAL;
pmd_write_lock(pmd);
- r = dm_sm_inc_blocks(pmd->data_sm, b, e);
+ if (!pmd->fail_io)
+ r = dm_sm_inc_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
@@ -1781,10 +1784,11 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
- int r = 0;
+ int r = -EINVAL;
pmd_write_lock(pmd);
- r = dm_sm_dec_blocks(pmd->data_sm, b, e);
+ if (!pmd->fail_io)
+ r = dm_sm_dec_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
@@ -1887,7 +1891,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs pmd root_lock).
- * - must take shrinker_mutex without holding pmd->root_lock
+ * - must take shrinker_rwsem without holding pmd->root_lock
*/
new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
THIN_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b13c949bd72..f1d0dcb9db22 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -401,8 +401,7 @@ static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t da
sector_t s = block_to_sectors(tc->pool, data_b);
sector_t len = block_to_sectors(tc->pool, data_e - data_b);
- return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
- &op->bio);
+ return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
}
static void end_discard(struct discard_op *op, int r)
@@ -3301,7 +3300,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned long block_size;
dm_block_t low_water_blocks;
struct dm_dev *metadata_dev;
- fmode_t metadata_mode;
+ blk_mode_t metadata_mode;
/*
* FIXME Remove validation from scope of lock.
@@ -3334,7 +3333,8 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto out_unlock;
- metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
+ metadata_mode = BLK_OPEN_READ |
+ ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE);
r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
if (r) {
ti->error = "Error opening metadata block device";
@@ -3342,7 +3342,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
warn_if_metadata_device_too_big(metadata_dev->bdev);
- r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
+ r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, &data_dev);
if (r) {
ti->error = "Error getting data device";
goto out_metadata;
@@ -4223,7 +4223,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_origin_dev;
}
- r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev);
if (r) {
ti->error = "Error opening origin device";
goto bad_origin_dev;
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index a9ee2faa75a2..3ef9f018da60 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -607,7 +607,7 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
(*argc)--;
if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
- r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev);
+ r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev);
if (r) {
ti->error = "FEC device lookup failed";
return r;
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index e35c16e06d06..26adcfea0302 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1196,7 +1196,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
+ if ((dm_table_get_mode(ti->table) & ~BLK_OPEN_READ)) {
ti->error = "Device must be readonly";
r = -EINVAL;
goto bad;
@@ -1225,13 +1225,13 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
v->version = num;
- r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+ r = dm_get_device(ti, argv[1], BLK_OPEN_READ, &v->data_dev);
if (r) {
ti->error = "Data device lookup failed";
goto bad;
}
- r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+ r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &v->hash_dev);
if (r) {
ti->error = "Hash device lookup failed";
goto bad;
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 8f0896a6990b..9d3cca8e3dc9 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -577,7 +577,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
bio->bi_iter.bi_sector = dmz_blk2sect(block);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
- bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
+ __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
return mblk;
@@ -728,7 +728,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
bio->bi_iter.bi_sector = dmz_blk2sect(block);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
- bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
+ __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
return 0;
@@ -752,7 +752,7 @@ static int dmz_rdwr_block(struct dmz_dev *dev, enum req_op op,
bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO,
GFP_NOIO);
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
+ __bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
ret = submit_bio_wait(bio);
bio_put(bio);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 74f79c4e45c9..c4cdab508287 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -207,7 +207,7 @@ static int __init local_init(void)
if (r)
return r;
- deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+ deferred_remove_workqueue = alloc_ordered_workqueue("kdmremove", 0);
if (!deferred_remove_workqueue) {
r = -ENOMEM;
goto out_uevent_exit;
@@ -310,13 +310,13 @@ int dm_deleting_md(struct mapped_device *md)
return test_bit(DMF_DELETING, &md->flags);
}
-static int dm_blk_open(struct block_device *bdev, fmode_t mode)
+static int dm_blk_open(struct gendisk *disk, blk_mode_t mode)
{
struct mapped_device *md;
spin_lock(&_minor_lock);
- md = bdev->bd_disk->private_data;
+ md = disk->private_data;
if (!md)
goto out;
@@ -334,7 +334,7 @@ out:
return md ? 0 : -ENXIO;
}
-static void dm_blk_close(struct gendisk *disk, fmode_t mode)
+static void dm_blk_close(struct gendisk *disk)
{
struct mapped_device *md;
@@ -448,7 +448,7 @@ static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
dm_put_live_table(md, srcu_idx);
}
-static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
+static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
@@ -734,7 +734,7 @@ static char *_dm_claim_ptr = "I belong to device-mapper";
* Open a table device so we can use it as a map destination.
*/
static struct table_device *open_table_device(struct mapped_device *md,
- dev_t dev, fmode_t mode)
+ dev_t dev, blk_mode_t mode)
{
struct table_device *td;
struct block_device *bdev;
@@ -746,7 +746,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
return ERR_PTR(-ENOMEM);
refcount_set(&td->count, 1);
- bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
+ bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL);
if (IS_ERR(bdev)) {
r = PTR_ERR(bdev);
goto out_free_td;
@@ -771,7 +771,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
return td;
out_blkdev_put:
- blkdev_put(bdev, mode | FMODE_EXCL);
+ blkdev_put(bdev, _dm_claim_ptr);
out_free_td:
kfree(td);
return ERR_PTR(r);
@@ -784,14 +784,14 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
{
if (md->disk->slave_dir)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
- blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+ blkdev_put(td->dm_dev.bdev, _dm_claim_ptr);
put_dax(td->dm_dev.dax_dev);
list_del(&td->list);
kfree(td);
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
- fmode_t mode)
+ blk_mode_t mode)
{
struct table_device *td;
@@ -802,7 +802,7 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
return NULL;
}
-int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode,
struct dm_dev **result)
{
struct table_device *td;
@@ -1172,7 +1172,8 @@ static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
}
static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
- unsigned int max_granularity)
+ unsigned int max_granularity,
+ unsigned int max_sectors)
{
sector_t target_offset = dm_target_offset(ti, sector);
sector_t len = max_io_len_target_boundary(ti, target_offset);
@@ -1186,13 +1187,13 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector,
if (!max_granularity)
return len;
return min_t(sector_t, len,
- min(queue_max_sectors(ti->table->md->queue),
+ min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
blk_chunk_sectors_left(target_offset, max_granularity)));
}
static inline sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
- return __max_io_len(ti, sector, ti->max_io_len);
+ return __max_io_len(ti, sector, ti->max_io_len, 0);
}
int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
@@ -1581,12 +1582,13 @@ static void __send_empty_flush(struct clone_info *ci)
static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
unsigned int num_bios,
- unsigned int max_granularity)
+ unsigned int max_granularity,
+ unsigned int max_sectors)
{
unsigned int len, bios;
len = min_t(sector_t, ci->sector_count,
- __max_io_len(ti, ci->sector, max_granularity));
+ __max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, num_bios, &len);
@@ -1623,23 +1625,27 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
{
unsigned int num_bios = 0;
unsigned int max_granularity = 0;
+ unsigned int max_sectors = 0;
struct queue_limits *limits = dm_get_queue_limits(ti->table->md);
switch (bio_op(ci->bio)) {
case REQ_OP_DISCARD:
num_bios = ti->num_discard_bios;
+ max_sectors = limits->max_discard_sectors;
if (ti->max_discard_granularity)
- max_granularity = limits->max_discard_sectors;
+ max_granularity = max_sectors;
break;
case REQ_OP_SECURE_ERASE:
num_bios = ti->num_secure_erase_bios;
+ max_sectors = limits->max_secure_erase_sectors;
if (ti->max_secure_erase_granularity)
- max_granularity = limits->max_secure_erase_sectors;
+ max_granularity = max_sectors;
break;
case REQ_OP_WRITE_ZEROES:
num_bios = ti->num_write_zeroes_bios;
+ max_sectors = limits->max_write_zeroes_sectors;
if (ti->max_write_zeroes_granularity)
- max_granularity = limits->max_write_zeroes_sectors;
+ max_granularity = max_sectors;
break;
default:
break;
@@ -1654,7 +1660,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
if (unlikely(!num_bios))
return BLK_STS_NOTSUPP;
- __send_changing_extent_only(ci, ti, num_bios, max_granularity);
+ __send_changing_extent_only(ci, ti, num_bios,
+ max_granularity, max_sectors);
return BLK_STS_OK;
}
@@ -2808,6 +2815,10 @@ retry:
}
map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map) {
+ /* avoid deadlock with fs/namespace.c:do_mount() */
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ }
r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
if (r)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a856e0aee73b..63d9010d8e61 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -203,7 +203,7 @@ int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
int dm_cancel_deferred_remove(struct mapped_device *md);
int dm_request_based(struct mapped_device *md);
-int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode,
struct dm_dev **result);
void dm_put_table_device(struct mapped_device *md, struct dm_dev *d);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 91836e6de326..6eaa0eab40f9 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -147,7 +147,8 @@ static void __init md_setup_drive(struct md_setup_args *args)
if (p)
*p++ = 0;
- dev = name_to_dev_t(devname);
+ if (early_lookup_bdev(devname, &dev))
+ dev = 0;
if (strncmp(devname, "/dev/", 5) == 0)
devname += 5;
snprintf(comp_name, 63, "/dev/%s", devname);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index bc8d7565171d..1ff712889a3b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -54,14 +54,7 @@ __acquires(bitmap->lock)
{
unsigned char *mappage;
- if (page >= bitmap->pages) {
- /* This can happen if bitmap_start_sync goes beyond
- * End-of-device while looking for a whole page.
- * It is harmless.
- */
- return -EINVAL;
- }
-
+ WARN_ON_ONCE(page >= bitmap->pages);
if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
return 0;
@@ -1023,7 +1016,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
return set;
}
-
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
@@ -1033,8 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
int dirty, need_write;
int writing = 0;
- if (!bitmap || !bitmap->storage.filemap ||
- test_bit(BITMAP_STALE, &bitmap->flags))
+ if (!md_bitmap_enabled(bitmap))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1063,6 +1054,35 @@ void md_bitmap_unplug(struct bitmap *bitmap)
}
EXPORT_SYMBOL(md_bitmap_unplug);
+/*
+ * Context handed to md_bitmap_unplug_fn(): the embedded work item, the
+ * bitmap to unplug, and the completion signalled once the unplug is done.
+ */
+struct bitmap_unplug_work {
+ struct work_struct work;
+ struct bitmap *bitmap;
+ struct completion *done;
+};
+
+/*
+ * Work handler: recover the enclosing bitmap_unplug_work from @work, unplug
+ * its bitmap, then signal the completion the submitter is waiting on.
+ */
+static void md_bitmap_unplug_fn(struct work_struct *work)
+{
+ struct bitmap_unplug_work *unplug_work =
+ container_of(work, struct bitmap_unplug_work, work);
+
+ md_bitmap_unplug(unplug_work->bitmap);
+ complete(unplug_work->done);
+}
+
+/*
+ * Run md_bitmap_unplug() for @bitmap from a work item queued on
+ * md_bitmap_wq and block until that work item has completed.
+ *
+ * Both the work item and the completion live on the caller's stack, so this
+ * function must not return before the work function has finished.
+ */
+void md_bitmap_unplug_async(struct bitmap *bitmap)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct bitmap_unplug_work unplug_work;
+
+ INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
+ unplug_work.bitmap = bitmap;
+ unplug_work.done = &done;
+
+ queue_work(md_bitmap_wq, &unplug_work.work);
+ wait_for_completion(&done);
+ /* Pairs with INIT_WORK_ONSTACK(): stop tracking the on-stack work item. */
+ destroy_work_on_stack(&unplug_work.work);
+}
+EXPORT_SYMBOL(md_bitmap_unplug_async);
+
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
* the in-memory bitmap from the on-disk bitmap -- also, sets up the
@@ -1241,11 +1261,28 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
int create);
+/*
+ * Update the wakeup timeout of the thread published in mddev->thread.
+ *
+ * The thread pointer is looked up under rcu_read_lock(); if no thread is
+ * registered nothing happens. With @force the timeout is always written;
+ * without it the timeout is only written when the current value is below
+ * MAX_SCHEDULE_TIMEOUT.
+ */
+static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
+ bool force)
+{
+ struct md_thread *thread;
+
+ rcu_read_lock();
+ thread = rcu_dereference(mddev->thread);
+
+ if (!thread)
+ goto out;
+
+ if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
+ thread->timeout = timeout;
+
+out:
+ rcu_read_unlock();
+}
+
/*
* bitmap daemon -- periodically wakes up to clean bits and flush pages
* out to disk
*/
-
void md_bitmap_daemon_work(struct mddev *mddev)
{
struct bitmap *bitmap;
@@ -1269,7 +1306,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) {
- mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
goto done;
}
bitmap->allclean = 1;
@@ -1366,8 +1403,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
done:
if (bitmap->allclean == 0)
- mddev->thread->timeout =
- mddev->bitmap_info.daemon_sleep;
+ mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
mutex_unlock(&mddev->bitmap_info.mutex);
}
@@ -1387,6 +1423,14 @@ __acquires(bitmap->lock)
sector_t csize;
int err;
+ if (page >= bitmap->pages) {
+ /*
+ * This can happen if bitmap_start_sync goes beyond
+ * End-of-device while looking for a whole page or
+ * user set a huge number to sysfs bitmap_set_bits.
+ */
+ return NULL;
+ }
err = md_bitmap_checkpage(bitmap, page, create, 0);
if (bitmap->bp[page].hijacked ||
@@ -1820,8 +1864,7 @@ void md_bitmap_destroy(struct mddev *mddev)
mddev->bitmap = NULL; /* disconnect from the md device */
spin_unlock(&mddev->lock);
mutex_unlock(&mddev->bitmap_info.mutex);
- if (mddev->thread)
- mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
md_bitmap_free(bitmap);
}
@@ -1964,7 +2007,7 @@ int md_bitmap_load(struct mddev *mddev)
/* Kick recovery in case any bits were set */
set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
- mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
+ mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
md_wakeup_thread(mddev->thread);
md_bitmap_update_sb(bitmap);
@@ -2469,17 +2512,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len)
timeout = MAX_SCHEDULE_TIMEOUT-1;
if (timeout < 1)
timeout = 1;
+
mddev->bitmap_info.daemon_sleep = timeout;
- if (mddev->thread) {
- /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
- * the bitmap is all clean and we don't need to
- * adjust the timeout right now
- */
- if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
- mddev->thread->timeout = timeout;
- md_wakeup_thread(mddev->thread);
- }
- }
+ mddev_set_timeout(mddev, timeout, false);
+ md_wakeup_thread(mddev->thread);
+
return len;
}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index cfd7395de8fd..8a3788c9bfef 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev,
sector_t new_lo, sector_t new_hi);
void md_bitmap_unplug(struct bitmap *bitmap);
+void md_bitmap_unplug_async(struct bitmap *bitmap);
void md_bitmap_daemon_work(struct mddev *mddev);
int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
@@ -273,6 +274,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
void md_bitmap_free(struct bitmap *bitmap);
void md_bitmap_wait_behind_writes(struct mddev *mddev);
+
+/*
+ * Return true when @bitmap is non-NULL, has a storage filemap, and is not
+ * flagged BITMAP_STALE; false otherwise.
+ */
+static inline bool md_bitmap_enabled(struct bitmap *bitmap)
+{
+ return bitmap && bitmap->storage.filemap &&
+ !test_bit(BITMAP_STALE, &bitmap->flags);
+}
+
#endif
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 10e0c5381d01..3d9fd74233df 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -75,14 +75,14 @@ struct md_cluster_info {
sector_t suspend_hi;
int suspend_from; /* the slot which broadcast suspend_lo/hi */
- struct md_thread *recovery_thread;
+ struct md_thread __rcu *recovery_thread;
unsigned long recovery_map;
/* communication loc resources */
struct dlm_lock_resource *ack_lockres;
struct dlm_lock_resource *message_lockres;
struct dlm_lock_resource *token_lockres;
struct dlm_lock_resource *no_new_dev_lockres;
- struct md_thread *recv_thread;
+ struct md_thread __rcu *recv_thread;
struct completion newdisk_completion;
wait_queue_head_t wait;
unsigned long state;
@@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot)
set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
- cinfo->recovery_thread = md_register_thread(recover_bitmaps,
- mddev, "recover");
+ rcu_assign_pointer(cinfo->recovery_thread,
+ md_register_thread(recover_bitmaps, mddev, "recover"));
if (!cinfo->recovery_thread) {
pr_warn("md-cluster: Could not create recovery thread\n");
return;
@@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
int got_lock = 0;
+ struct md_thread *thread;
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
- wait_event(mddev->thread->wqueue,
+
+ /* daemon thread must exist */
+ thread = rcu_dereference_protected(mddev->thread, true);
+ wait_event(thread->wqueue,
(got_lock = mddev_trylock(mddev)) ||
test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
md_reload_sb(mddev, mddev->good_device_nr);
@@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes)
}
/* Initiate the communication resources */
ret = -ENOMEM;
- cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+ rcu_assign_pointer(cinfo->recv_thread,
+ md_register_thread(recv_daemon, mddev, "cluster_recv"));
if (!cinfo->recv_thread) {
pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
goto err;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 66edf5e72bd6..92c45be203d7 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev)
if (ret)
goto out_free_conf;
- mddev->thread = md_register_thread(multipathd, mddev,
- "multipath");
+ rcu_assign_pointer(mddev->thread,
+ md_register_thread(multipathd, mddev, "multipath"));
if (!mddev->thread)
goto out_free_conf;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e344b4b3444..cf3733c90c47 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -70,11 +70,7 @@
#include "md-bitmap.h"
#include "md-cluster.h"
-/* pers_list is a list of registered personalities protected
- * by pers_lock.
- * pers_lock does extra service to protect accesses to
- * mddev->thread when the mutex cannot be held.
- */
+/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
@@ -87,23 +83,13 @@ static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
-static struct workqueue_struct *md_rdev_misc_wq;
+struct workqueue_struct *md_bitmap_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
-
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
+static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -360,10 +346,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-static bool is_md_suspended(struct mddev *mddev)
-{
- return percpu_ref_is_dying(&mddev->active_io);
-}
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -457,13 +439,19 @@ static void md_submit_bio(struct bio *bio)
*/
void mddev_suspend(struct mddev *mddev)
{
- WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
- lockdep_assert_held(&mddev->reconfig_mutex);
+ struct md_thread *thread = rcu_dereference_protected(mddev->thread,
+ lockdep_is_held(&mddev->reconfig_mutex));
+
+ WARN_ON_ONCE(thread && current == thread->tsk);
if (mddev->suspended++)
return;
wake_up(&mddev->sb_wait);
set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
percpu_ref_kill(&mddev->active_io);
+
+ if (mddev->pers->prepare_suspend)
+ mddev->pers->prepare_suspend(mddev);
+
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
@@ -655,9 +643,11 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
+ mutex_init(&mddev->delete_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
+ INIT_LIST_HEAD(&mddev->deleting);
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
@@ -759,6 +749,24 @@ static void mddev_free(struct mddev *mddev)
static const struct attribute_group md_redundancy_group;
+/*
+ * Tear down every rdev queued on mddev->deleting.
+ *
+ * With delete_mutex held, each queued rdev is unlinked from the list, its
+ * kobject is deleted, and it is handed to export_rdev(). An empty list is a
+ * no-op.
+ */
+static void md_free_rdev(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct md_rdev *tmp;
+
+ mutex_lock(&mddev->delete_mutex);
+ if (list_empty(&mddev->deleting))
+ goto out;
+
+ list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
+ list_del_init(&rdev->same_set);
+ kobject_del(&rdev->kobj);
+ export_rdev(rdev, mddev);
+ }
+out:
+ mutex_unlock(&mddev->delete_mutex);
+}
+
void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
@@ -800,13 +808,10 @@ void mddev_unlock(struct mddev *mddev)
} else
mutex_unlock(&mddev->reconfig_mutex);
- /* As we've dropped the mutex we need a spinlock to
- * make sure the thread doesn't disappear
- */
- spin_lock(&pers_lock);
+ md_free_rdev(mddev);
+
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
- spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -938,7 +943,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector;
- bio_add_page(bio, page, size, 0);
+ __bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
@@ -979,7 +984,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
else
bio.bi_iter.bi_sector = sector + rdev->data_offset;
- bio_add_page(&bio, page, size, 0);
+ __bio_add_page(&bio, page, size, 0);
submit_bio_wait(&bio);
@@ -2440,16 +2445,12 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return err;
}
-static void rdev_delayed_delete(struct work_struct *ws)
-{
- struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
- kobject_del(&rdev->kobj);
- kobject_put(&rdev->kobj);
-}
-
void md_autodetect_dev(dev_t dev);
-static void export_rdev(struct md_rdev *rdev)
+/* just for claiming the bdev */
+static struct md_rdev claim_rdev;
+
+static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
md_rdev_clear(rdev);
@@ -2457,13 +2458,15 @@ static void export_rdev(struct md_rdev *rdev)
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
- blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev);
rdev->bdev = NULL;
kobject_put(&rdev->kobj);
}
static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
+ struct mddev *mddev = rdev->mddev;
+
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
@@ -2477,15 +2480,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
rdev->sysfs_unack_badblocks = NULL;
rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
- /* We need to delay this, otherwise we can deadlock when
- * writing to 'remove' to "dev/state". We also need
- * to delay it due to rcu usage.
- */
+
synchronize_rcu();
- INIT_WORK(&rdev->del_work, rdev_delayed_delete);
- kobject_get(&rdev->kobj);
- queue_work(md_rdev_misc_wq, &rdev->del_work);
- export_rdev(rdev);
+
+ /*
+ * kobject_del() will wait for all in progress writers to be done, where
+ * reconfig_mutex is held, hence it can't be called under
+ * reconfig_mutex and it's delayed to mddev_unlock().
+ */
+ mutex_lock(&mddev->delete_mutex);
+ list_add(&rdev->same_set, &mddev->deleting);
+ mutex_unlock(&mddev->delete_mutex);
}
static void export_array(struct mddev *mddev)
@@ -3553,6 +3558,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
+ struct kernfs_node *kn = NULL;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3560,6 +3566,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+
+ if (entry->store == state_store && cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+
rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
@@ -3568,6 +3578,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
}
+
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
+
return rv;
}
@@ -3612,6 +3626,7 @@ int md_rdev_init(struct md_rdev *rdev)
return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
+
/*
* Import a device. If 'super_format' >= 0, then sanity check the superblock
*
@@ -3624,7 +3639,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
*/
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
- static struct md_rdev claim_rdev; /* just for claiming the bdev */
struct md_rdev *rdev;
sector_t size;
int err;
@@ -3640,9 +3654,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
if (err)
goto out_clear_rdev;
- rdev->bdev = blkdev_get_by_dev(newdev,
- FMODE_READ | FMODE_WRITE | FMODE_EXCL,
- super_format == -2 ? &claim_rdev : rdev);
+ rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ super_format == -2 ? &claim_rdev : rdev, NULL);
if (IS_ERR(rdev->bdev)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(newdev), MINOR(newdev));
@@ -3679,7 +3692,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
return rdev;
out_blkdev_put:
- blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(rdev->bdev, super_format == -2 ? &claim_rdev : rdev);
out_clear_rdev:
md_rdev_clear(rdev);
out_free_rdev:
@@ -3794,8 +3807,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
- int msec = (mddev->safemode_delay*1000)/HZ;
- return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+ unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
+
+ return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
}
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
@@ -3807,7 +3821,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
return -EINVAL;
}
- if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
+ if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
return -EINVAL;
if (msec == 0)
mddev->safemode_delay = 0;
@@ -4477,6 +4491,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len
rv = kstrtouint(buf, 10, &n);
if (rv < 0)
return rv;
+ if (n > INT_MAX)
+ return -EINVAL;
atomic_set(&mddev->max_corr_read_errors, n);
return len;
}
@@ -4491,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL;
}
-/* need to ensure rdev_delayed_delete() has completed */
-static void flush_rdev_wq(struct mddev *mddev)
-{
- struct md_rdev *rdev;
-
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (work_pending(&rdev->del_work)) {
- flush_workqueue(md_rdev_misc_wq);
- break;
- }
- rcu_read_unlock();
-}
-
static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
@@ -4532,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- flush_rdev_wq(mddev);
err = mddev_lock(mddev);
if (err)
return err;
@@ -4560,7 +4561,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
mddev_unlock(mddev);
if (!err)
md_new_event();
@@ -4804,11 +4805,21 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
err = -EBUSY;
- else {
+ } else if (mddev->reshape_position == MaxSector ||
+ mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev)) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
err = mddev->pers->start_reshape(mddev);
+ } else {
+ /*
+ * If reshape is still in progress, and
+ * md_check_recovery() can continue to reshape,
+ * don't restart reshape because data can be
+ * corrupted for raid456.
+ */
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
mddev_unlock(mddev);
}
@@ -5592,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
* removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
- flush_workqueue(md_rdev_misc_wq);
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
@@ -6269,10 +6279,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
}
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- if (mddev->sync_thread)
- /* Thread might be blocked waiting for metadata update
- * which will now never happen */
- wake_up_process(mddev->sync_thread->tsk);
+
+ /*
+ * Thread might be blocked waiting for metadata update which will now
+ * never happen
+ */
+ md_wakeup_thread_directly(mddev->sync_thread);
if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
return -EBUSY;
@@ -6333,10 +6345,12 @@ static int do_md_stop(struct mddev *mddev, int mode,
}
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- if (mddev->sync_thread)
- /* Thread might be blocked waiting for metadata update
- * which will now never happen */
- wake_up_process(mddev->sync_thread->tsk);
+
+ /*
+ * Thread might be blocked waiting for metadata update which will now
+ * never happen
+ */
+ md_wakeup_thread_directly(mddev->sync_thread);
mddev_unlock(mddev);
wait_event(resync_wait, (mddev->sync_thread == NULL &&
@@ -6498,7 +6512,7 @@ static void autorun_devices(int part)
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
if (bind_rdev_to_array(rdev, mddev))
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
}
autorun_array(mddev);
mddev_unlock(mddev);
@@ -6508,7 +6522,7 @@ static void autorun_devices(int part)
*/
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
}
mddev_put(mddev);
}
@@ -6696,13 +6710,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev,
rdev0->bdev);
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return -EINVAL;
}
}
err = bind_rdev_to_array(rdev, mddev);
if (err)
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return err;
}
@@ -6733,7 +6747,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
if (info->state & (1<<MD_DISK_SYNC) &&
info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
- set_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
} else
rdev->raid_disk = -1;
@@ -6746,7 +6759,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* This was a hot-add request, but events doesn't
* match, so reject it.
*/
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return -EINVAL;
}
@@ -6772,7 +6785,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
}
}
if (has_journal || mddev->bitmap) {
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return -EBUSY;
}
set_bit(Journal, &rdev->flags);
@@ -6787,7 +6800,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* --add initiated by this node */
err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return err;
}
}
@@ -6797,7 +6810,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err)
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
@@ -6860,7 +6873,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err) {
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return err;
}
}
@@ -6985,7 +6998,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
return 0;
abort_export:
- export_rdev(rdev);
+ export_rdev(rdev, mddev);
return err;
}
@@ -7486,7 +7499,7 @@ static int __md_set_array_info(struct mddev *mddev, void __user *argp)
return err;
}
-static int md_ioctl(struct block_device *bdev, fmode_t mode,
+static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
int err = 0;
@@ -7555,9 +7568,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
- if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
- flush_rdev_wq(mddev);
-
if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait,
@@ -7718,7 +7728,7 @@ out:
return err;
}
#ifdef CONFIG_COMPAT
-static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
+static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -7767,13 +7777,13 @@ out_unlock:
return err;
}
-static int md_open(struct block_device *bdev, fmode_t mode)
+static int md_open(struct gendisk *disk, blk_mode_t mode)
{
struct mddev *mddev;
int err;
spin_lock(&all_mddevs_lock);
- mddev = mddev_get(bdev->bd_disk->private_data);
+ mddev = mddev_get(disk->private_data);
spin_unlock(&all_mddevs_lock);
if (!mddev)
return -ENODEV;
@@ -7789,7 +7799,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
- bdev_check_media_change(bdev);
+ disk_check_media_change(disk);
return 0;
out_unlock:
@@ -7799,7 +7809,7 @@ out:
return err;
}
-static void md_release(struct gendisk *disk, fmode_t mode)
+static void md_release(struct gendisk *disk)
{
struct mddev *mddev = disk->private_data;
@@ -7886,13 +7896,29 @@ static int md_thread(void *arg)
return 0;
}
-void md_wakeup_thread(struct md_thread *thread)
+/*
+ * Wake the task behind @thread with wake_up_process(), without setting
+ * THREAD_WAKEUP or touching the thread's wait queue. The pointer is read
+ * under rcu_read_lock(); a NULL thread is ignored.
+ */
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
{
- if (thread) {
- pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
- set_bit(THREAD_WAKEUP, &thread->flags);
- wake_up(&thread->wqueue);
+ struct md_thread *t;
+
+ rcu_read_lock();
+ t = rcu_dereference(thread);
+ if (t)
+ wake_up_process(t->tsk);
+ rcu_read_unlock();
+}
+
+/*
+ * Flag @thread with THREAD_WAKEUP and wake whatever sleeps on its wait
+ * queue. The pointer is dereferenced under rcu_read_lock(); a NULL thread
+ * is ignored.
+ */
+void md_wakeup_thread(struct md_thread __rcu *thread)
+{
+ struct md_thread *t;
+
+ rcu_read_lock();
+ t = rcu_dereference(thread);
+ if (t) {
+ pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
+ set_bit(THREAD_WAKEUP, &t->flags);
+ wake_up(&t->wqueue);
}
+ rcu_read_unlock();
}
EXPORT_SYMBOL(md_wakeup_thread);
@@ -7922,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
}
EXPORT_SYMBOL(md_register_thread);
-void md_unregister_thread(struct md_thread **threadp)
+void md_unregister_thread(struct md_thread __rcu **threadp)
{
- struct md_thread *thread;
+ struct md_thread *thread = rcu_dereference_protected(*threadp, true);
- /*
- * Locking ensures that mddev_unlock does not wake_up a
- * non-existent thread
- */
- spin_lock(&pers_lock);
- thread = *threadp;
- if (!thread) {
- spin_unlock(&pers_lock);
+ if (!thread)
return;
- }
- *threadp = NULL;
- spin_unlock(&pers_lock);
+
+ rcu_assign_pointer(*threadp, NULL);
+ synchronize_rcu();
pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
kthread_stop(thread->tsk);
@@ -9100,6 +9119,7 @@ void md_do_sync(struct md_thread *thread)
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
+ wake_up(&mddev->sb_wait);
md_wakeup_thread(mddev->thread);
return;
}
@@ -9202,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
- mddev->sync_thread = md_register_thread(md_do_sync,
- mddev,
- "resync");
+ rcu_assign_pointer(mddev->sync_thread,
+ md_register_thread(md_do_sync, mddev, "resync"));
if (!mddev->sync_thread) {
pr_warn("%s: could not start resync thread...\n",
mdname(mddev));
@@ -9619,9 +9638,10 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
- md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
- if (!md_rdev_misc_wq)
- goto err_rdev_misc_wq;
+ md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+ 0);
+ if (!md_bitmap_wq)
+ goto err_bitmap_wq;
ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
@@ -9641,8 +9661,8 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
- destroy_workqueue(md_rdev_misc_wq);
-err_rdev_misc_wq:
+ destroy_workqueue(md_bitmap_wq);
+err_bitmap_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
@@ -9938,8 +9958,8 @@ static __exit void md_exit(void)
}
spin_unlock(&all_mddevs_lock);
- destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
+ destroy_workqueue(md_bitmap_wq);
destroy_workqueue(md_wq);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fd8f260ed5f8..bfd2306bc750 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,8 +122,6 @@ struct md_rdev {
struct serial_in_rdev *serial; /* used for raid1 io serialization */
- struct work_struct del_work; /* used for delayed sysfs removal */
-
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */
@@ -367,8 +365,8 @@ struct mddev {
int new_chunk_sectors;
int reshape_backwards;
- struct md_thread *thread; /* management thread */
- struct md_thread *sync_thread; /* doing resync or reconstruct */
+ struct md_thread __rcu *thread; /* management thread */
+ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */
/* 'last_sync_action' is initialized to "none". It is set when a
* sync operation (i.e "data-check", "requested-resync", "resync",
@@ -531,6 +529,14 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */
+ /*
+ * Temporarily store rdev that will be finally removed when
+ * reconfig_mutex is unlocked.
+ */
+ struct list_head deleting;
+ /* Protect the deleting list */
+ struct mutex delete_mutex;
+
bool has_superblocks:1;
bool fail_last_dev:1;
bool serialize_policy:1;
@@ -555,6 +561,23 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
+/*
+ * Values compared against mddev->ro (see md_is_rdwr()).
+ * NOTE(review): MD_AUTO_READ presumably is the "auto read-only" mode and
+ * MD_MAX_STATE a count/sentinel rather than a real state -- confirm.
+ */
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+/* Return true when mddev->ro is MD_RDWR. */
+static inline bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
+/*
+ * The array counts as suspended while the mddev->active_io percpu
+ * reference is dying, as reported by percpu_ref_is_dying().
+ */
+static inline bool is_md_suspended(struct mddev *mddev)
+{
+ return percpu_ref_is_dying(&mddev->active_io);
+}
+
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -614,6 +637,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
+ void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -734,8 +758,8 @@ extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread),
struct mddev *mddev,
const char *name);
-extern void md_unregister_thread(struct md_thread **threadp);
-extern void md_wakeup_thread(struct md_thread *thread);
+extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern int mddev_init_writes_pending(struct mddev *mddev);
@@ -828,6 +852,7 @@ struct mdu_array_info_s;
struct mdu_disk_info_s;
extern int mdp_major;
+extern struct workqueue_struct *md_bitmap_wq;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index e61f6cad4e08..169ebe296f2d 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -21,6 +21,7 @@
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+#define MAX_PLUG_BIO 32
/* for managing resync I/O pages */
struct resync_pages {
@@ -31,6 +32,7 @@ struct resync_pages {
struct raid1_plug_cb {
struct blk_plug_cb cb;
struct bio_list pending;
+ unsigned int count;
};
static void rbio_pool_free(void *rbio, void *data)
@@ -101,11 +103,73 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
struct page *page = resync_fetch_page(rp, idx);
int len = min_t(int, size, PAGE_SIZE);
- /*
- * won't fail because the vec table is big
- * enough to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ return;
+ }
+
size -= len;
} while (idx++ < RESYNC_PAGES && size > 0);
}
+
+
+/*
+ * Issue one queued write bio.
+ *
+ * On entry bio->bi_bdev holds a struct md_rdev pointer rather than a block
+ * device. The bio is unlinked from its list (bi_next cleared), pointed at
+ * rdev->bdev, and then:
+ *  - failed with bio_io_error() if the rdev is Faulty;
+ *  - completed without being issued if it is a discard and
+ *    bdev_max_discard_sectors() is 0 for the device;
+ *  - submitted with submit_bio_noacct() otherwise.
+ */
+static inline void raid1_submit_write(struct bio *bio)
+{
+ struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
+
+ bio->bi_next = NULL;
+ bio_set_dev(bio, rdev->bdev);
+ if (test_bit(Faulty, &rdev->flags))
+ bio_io_error(bio);
+ else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
+ !bdev_max_discard_sectors(bio->bi_bdev)))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ submit_bio_noacct(bio);
+}
+
+/*
+ * Submit @bio immediately, or queue it on the plug callback obtained from
+ * blk_check_plugged().
+ *
+ * @mddev:  array the write belongs to; passed to blk_check_plugged() as the
+ *          callback data
+ * @bio:    write bio to submit or queue
+ * @unplug: unplug callback handed to blk_check_plugged()
+ * @copies: scales the flush threshold: the plug is flushed once
+ *          count / MAX_PLUG_BIO reaches this value
+ *
+ * Returns true if the bio was submitted or added to the plug's pending
+ * list, false if blk_check_plugged() returned NULL and the bio is left for
+ * the caller to handle.
+ */
+static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
+ blk_plug_cb_fn unplug, int copies)
+{
+ struct raid1_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
+
+ /*
+ * If the bitmap is not enabled, it is safe to submit the io directly,
+ * which gives the best performance.
+ */
+ if (!md_bitmap_enabled(mddev->bitmap)) {
+ raid1_submit_write(bio);
+ return true;
+ }
+
+ cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
+ if (!cb)
+ return false;
+
+ plug = container_of(cb, struct raid1_plug_cb, cb);
+ bio_list_add(&plug->pending, bio);
+ /* Enough bios are batched: take the cb off its list and run it now. */
+ if (++plug->count / MAX_PLUG_BIO >= copies) {
+ list_del(&cb->list);
+ cb->callback(cb, false);
+ }
+
+
+ return true;
+}
+
+/*
+ * Flush pending bitmap writes before the data writes are issued.
+ *
+ * current->bio_list is set while running in submit_bio() context. In that
+ * case bitmap io would be added to that list and would wait for the current
+ * io submission to finish, while the current io submission must wait for the
+ * bitmap io to be done. To avoid this deadlock, submit the bitmap io
+ * asynchronously when current->bio_list is set; otherwise unplug the bitmap
+ * with md_bitmap_unplug().
+ */
+static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
+{
+ if (current->bio_list)
+ md_bitmap_unplug_async(bitmap);
+ else
+ md_bitmap_unplug(bitmap);
+}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 68a9e2d9985b..dd25832eb045 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -794,22 +794,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
- md_bitmap_unplug(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev->bitmap);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void *)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ raid1_submit_write(bio);
bio = next;
cond_resched();
}
@@ -1147,7 +1138,10 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
if (unlikely(!page))
goto free_pages;
- bio_add_page(behind_bio, page, len, 0);
+ if (!bio_add_page(behind_bio, page, len, 0)) {
+ put_page(page);
+ goto free_pages;
+ }
size -= len;
i++;
@@ -1175,7 +1169,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r1conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule || current->bio_list) {
+ if (from_schedule) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
@@ -1343,8 +1337,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct md_rdev *blocked_rdev;
- struct blk_plug_cb *cb;
- struct raid1_plug_cb *plug = NULL;
int first_clone;
int max_sectors;
bool write_behind = false;
@@ -1573,15 +1565,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
-
- cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid1_plug_cb, cb);
- else
- plug = NULL;
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- } else {
+ if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2914,7 +2898,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* won't fail because the vec table is big
* enough to hold all these pages
*/
- bio_add_page(bio, page, len, 0);
+ __bio_add_page(bio, page, len, 0);
}
}
nr_sectors += len>>9;
@@ -3084,7 +3068,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
}
err = -ENOMEM;
- conf->thread = md_register_thread(raid1d, mddev, "raid1");
+ rcu_assign_pointer(conf->thread,
+ md_register_thread(raid1d, mddev, "raid1"));
if (!conf->thread)
goto abort;
@@ -3177,8 +3162,8 @@ static int raid1_run(struct mddev *mddev)
/*
* Ok, everything is just fine now
*/
- mddev->thread = conf->thread;
- conf->thread = NULL;
+ rcu_assign_pointer(mddev->thread, conf->thread);
+ rcu_assign_pointer(conf->thread, NULL);
mddev->private = conf;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ebb6788820e7..468f189da7a0 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -130,7 +130,7 @@ struct r1conf {
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
*/
- struct md_thread *thread;
+ struct md_thread __rcu *thread;
/* Keep track of cluster resync window to send to other
* nodes.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4fcfcb350d2b..d0de8c9fb3cf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -779,8 +779,16 @@ static struct md_rdev *read_balance(struct r10conf *conf,
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
- r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ r10_bio->devs[slot].addr + sectors >
+ rdev->recovery_offset) {
+ /*
+ * Read replacement first, so that we never see both rdev
+ * and replacement as NULL while the replacement is
+ * replacing rdev.
+ */
+ smp_mb();
rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ }
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags))
continue;
@@ -902,25 +910,15 @@ static void flush_pending_writes(struct r10conf *conf)
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
- /* flush any pending bitmap writes to disk
- * before proceeding w/ I/O */
- md_bitmap_unplug(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev->bitmap);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ raid1_submit_write(bio);
bio = next;
+ cond_resched();
}
blk_finish_plug(&plug);
} else
@@ -982,6 +980,7 @@ static void lower_barrier(struct r10conf *conf)
static bool stop_waiting_barrier(struct r10conf *conf)
{
struct bio_list *bio_list = current->bio_list;
+ struct md_thread *thread;
/* barrier is dropped */
if (!conf->barrier)
@@ -997,12 +996,14 @@ static bool stop_waiting_barrier(struct r10conf *conf)
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
+ /* daemon thread must exist while handling io */
+ thread = rcu_dereference_protected(conf->mddev->thread, true);
/*
* move on if io is issued from raid10d(), nr_pending is not released
* from original io(see handle_read_error()). All raise barrier is
* blocked until this io is done.
*/
- if (conf->mddev->thread->tsk == current) {
+ if (thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
return true;
}
@@ -1113,7 +1114,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r10conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule || current->bio_list) {
+ if (from_schedule) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
@@ -1125,23 +1126,15 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- md_bitmap_unplug(mddev->bitmap);
+ raid1_prepare_flush_writes(mddev->bitmap);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio_set_dev(bio, rdev->bdev);
- if (test_bit(Faulty, &rdev->flags)) {
- bio_io_error(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !bdev_max_discard_sectors(bio->bi_bdev)))
- /* Just ignore it */
- bio_endio(bio);
- else
- submit_bio_noacct(bio);
+
+ raid1_submit_write(bio);
bio = next;
+ cond_resched();
}
kfree(plug);
}
@@ -1282,8 +1275,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
unsigned long flags;
- struct blk_plug_cb *cb;
- struct raid1_plug_cb *plug = NULL;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
int devnum = r10_bio->devs[n_copy].devnum;
@@ -1323,14 +1314,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
atomic_inc(&r10_bio->remaining);
- cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid1_plug_cb, cb);
- else
- plug = NULL;
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- } else {
+ if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1479,9 +1463,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[d].replacement);
+ struct md_rdev *rdev, *rrdev;
+
+ rrdev = rcu_dereference(conf->mirrors[d].replacement);
+ /*
+ * Read replacement first, so that we never see both rdev and
+ * replacement as NULL while the replacement is replacing rdev.
+ */
+ smp_mb();
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && (test_bit(Faulty, &rdev->flags)))
@@ -2148,9 +2138,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r10conf *conf = mddev->private;
int err = -EEXIST;
- int mirror;
+ int mirror, repl_slot = -1;
int first = 0;
int last = conf->geo.raid_disks - 1;
+ struct raid10_info *p;
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
@@ -2173,23 +2164,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
else
mirror = first;
for ( ; mirror <= last ; mirror++) {
- struct raid10_info *p = &conf->mirrors[mirror];
+ p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled)
continue;
if (p->rdev) {
- if (!test_bit(WantReplacement, &p->rdev->flags) ||
- p->replacement != NULL)
- continue;
- clear_bit(In_sync, &rdev->flags);
- set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = mirror;
- err = 0;
- if (mddev->gendisk)
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- conf->fullsync = 1;
- rcu_assign_pointer(p->replacement, rdev);
- break;
+ if (test_bit(WantReplacement, &p->rdev->flags) &&
+ p->replacement == NULL && repl_slot < 0)
+ repl_slot = mirror;
+ continue;
}
if (mddev->gendisk)
@@ -2206,6 +2188,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
+ if (err && repl_slot >= 0) {
+ p = &conf->mirrors[repl_slot];
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = repl_slot;
+ err = 0;
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ }
+
print_conf(conf);
return err;
}
@@ -3303,6 +3298,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
+ int error_disk = -1;
/*
* Allow skipping a full rebuild for incremental assembly
@@ -3386,8 +3382,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
- /* if there has been nothing to do on any drive,
- * then there is nothing to do at all..
+ pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
+ if (error_disk >= 0 &&
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /*
+ * recovery fails, set mirrors.recovery_disabled,
+ * device shouldn't be added to there.
+ */
+ conf->mirrors[error_disk].recovery_disabled =
+ mddev->recovery_disabled;
+ return 0;
+ }
+ /*
+ * if there has been nothing to do on any drive,
+ * then there is nothing to do at all.
*/
*skipped = 1;
return (max_sector - sector_nr) + sectors_skipped;
@@ -3437,8 +3446,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect;
int must_sync;
int any_working;
- int need_recover = 0;
- int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3446,15 +3453,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mrdev = rcu_dereference(mirror->rdev);
mreplace = rcu_dereference(mirror->replacement);
- if (mrdev != NULL &&
- !test_bit(Faulty, &mrdev->flags) &&
- !test_bit(In_sync, &mrdev->flags))
- need_recover = 1;
- if (mreplace != NULL &&
- !test_bit(Faulty, &mreplace->flags))
- need_replace = 1;
+ if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
+ test_bit(In_sync, &mrdev->flags)))
+ mrdev = NULL;
+ if (mreplace && test_bit(Faulty, &mreplace->flags))
+ mreplace = NULL;
- if (!need_recover && !need_replace) {
+ if (!mrdev && !mreplace) {
rcu_read_unlock();
continue;
}
@@ -3470,8 +3475,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rcu_read_unlock();
continue;
}
- if (mreplace && test_bit(Faulty, &mreplace->flags))
- mreplace = NULL;
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
@@ -3490,7 +3493,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rcu_read_unlock();
continue;
}
- atomic_inc(&mrdev->nr_pending);
+ if (mrdev)
+ atomic_inc(&mrdev->nr_pending);
if (mreplace)
atomic_inc(&mreplace->nr_pending);
rcu_read_unlock();
@@ -3577,7 +3581,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
- if (need_recover) {
+ if (mrdev) {
bio = r10_bio->devs[1].bio;
bio->bi_next = biolist;
biolist = bio;
@@ -3594,11 +3598,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
- /* Note: if need_replace, then bio
+ /* Note: if mreplace is not NULL, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
*/
- if (!need_replace)
+ if (!mreplace)
break;
bio->bi_next = biolist;
biolist = bio;
@@ -3622,7 +3626,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
- if (!test_bit(In_sync,
+ if (mrdev && !test_bit(In_sync,
&mrdev->flags)
&& !rdev_set_badblocks(
mrdev,
@@ -3643,17 +3647,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mdname(mddev));
mirror->recovery_disabled
= mddev->recovery_disabled;
+ } else {
+ error_disk = i;
}
put_buf(r10_bio);
if (rb2)
atomic_dec(&rb2->remaining);
r10_bio = rb2;
- rdev_dec_pending(mrdev, mddev);
+ if (mrdev)
+ rdev_dec_pending(mrdev, mddev);
if (mreplace)
rdev_dec_pending(mreplace, mddev);
break;
}
- rdev_dec_pending(mrdev, mddev);
+ if (mrdev)
+ rdev_dec_pending(mrdev, mddev);
if (mreplace)
rdev_dec_pending(mreplace, mddev);
if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
@@ -3819,11 +3827,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (bio= biolist ; bio ; bio=bio->bi_next) {
struct resync_pages *rp = get_resync_pages(bio);
page = resync_fetch_page(rp, page_idx);
- /*
- * won't fail because the vec table is big enough
- * to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ goto giveup;
+ }
}
nr_sectors += len>>9;
sector_nr += len>>9;
@@ -4107,7 +4115,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
atomic_set(&conf->nr_pending, 0);
err = -ENOMEM;
- conf->thread = md_register_thread(raid10d, mddev, "raid10");
+ rcu_assign_pointer(conf->thread,
+ md_register_thread(raid10d, mddev, "raid10"));
if (!conf->thread)
goto out;
@@ -4152,8 +4161,8 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
- mddev->thread = conf->thread;
- conf->thread = NULL;
+ rcu_assign_pointer(mddev->thread, conf->thread);
+ rcu_assign_pointer(conf->thread, NULL);
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
@@ -4296,8 +4305,8 @@ static int raid10_run(struct mddev *mddev)
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev->sync_thread = md_register_thread(md_do_sync, mddev,
- "reshape");
+ rcu_assign_pointer(mddev->sync_thread,
+ md_register_thread(md_do_sync, mddev, "reshape"));
if (!mddev->sync_thread)
goto out_free_conf;
}
@@ -4698,8 +4707,8 @@ out:
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev->sync_thread = md_register_thread(md_do_sync, mddev,
- "reshape");
+ rcu_assign_pointer(mddev->sync_thread,
+ md_register_thread(md_do_sync, mddev, "reshape"));
if (!mddev->sync_thread) {
ret = -EAGAIN;
goto abort;
@@ -4997,11 +5006,11 @@ read_more:
if (len > PAGE_SIZE)
len = PAGE_SIZE;
for (bio = blist; bio ; bio = bio->bi_next) {
- /*
- * won't fail because the vec table is big enough
- * to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ return sectors_done;
+ }
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 8c072ce0bc54..63e48b11b552 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -100,7 +100,7 @@ struct r10conf {
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
*/
- struct md_thread *thread;
+ struct md_thread __rcu *thread;
/*
* Keep track of cluster resync window to send to other nodes.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 46182b955aef..47ba7d9e81e1 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -120,7 +120,7 @@ struct r5l_log {
struct bio_set bs;
mempool_t meta_pool;
- struct md_thread *reclaim_thread;
+ struct md_thread __rcu *reclaim_thread;
unsigned long reclaim_target; /* number of space that need to be
* reclaimed. if it's 0, reclaim spaces
* used by io_units which are in
@@ -792,7 +792,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
io->current_bio = r5l_bio_alloc(log);
io->current_bio->bi_end_io = r5l_log_endio;
io->current_bio->bi_private = io;
- bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
+ __bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
r5_reserve_log_entry(log, io);
@@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
void r5l_quiesce(struct r5l_log *log, int quiesce)
{
- struct mddev *mddev;
+ struct mddev *mddev = log->rdev->mddev;
+ struct md_thread *thread = rcu_dereference_protected(
+ log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex));
if (quiesce) {
/* make sure r5l_write_super_and_discard_space exits */
- mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
- kthread_park(log->reclaim_thread->tsk);
+ kthread_park(thread->tsk);
r5l_wake_reclaim(log, MaxSector);
r5l_do_reclaim(log);
} else
- kthread_unpark(log->reclaim_thread->tsk);
+ kthread_unpark(thread->tsk);
}
bool r5l_log_disk_error(struct r5conf *conf)
@@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct r5l_log *log;
+ struct md_thread *thread;
int ret;
pr_debug("md/raid:%s: using device %pg as journal\n",
@@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->tree_lock);
INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
- log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
- log->rdev->mddev, "reclaim");
- if (!log->reclaim_thread)
+ thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
+ "reclaim");
+ if (!thread)
goto reclaim_thread;
- log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
+ thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+ rcu_assign_pointer(log->reclaim_thread, thread);
init_waitqueue_head(&log->iounit_wait);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index e495939bb3e0..eaea57aee602 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -465,7 +465,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
bio->bi_end_io = ppl_log_endio;
bio->bi_iter.bi_sector = log->next_io_sector;
- bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+ __bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
pr_debug("%s: log->current_io_sector: %llu\n", __func__,
(unsigned long long)log->next_io_sector);
@@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
prev->bi_opf, GFP_NOIO,
&ppl_conf->bs);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+ __bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
bio_chain(bio, prev);
ppl_submit_iounit_bio(io, prev);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4739ed891e75..85b3004594e0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2433,7 +2433,7 @@ static int grow_stripes(struct r5conf *conf, int num)
conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name],
- sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
+ struct_size_t(struct stripe_head, dev, devs),
0, 0, NULL);
if (!sc)
return 1;
@@ -2559,7 +2559,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
- sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
+ struct_size_t(struct stripe_head, dev, newsize),
0, 0, NULL);
if (!sc)
return -ENOMEM;
@@ -5516,7 +5516,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
&dd_idx, NULL);
- end_sector = bio_end_sector(raid_bio);
+ end_sector = sector + bio_sectors(raid_bio);
rcu_read_lock();
if (r5c_big_stripe_cached(conf, sector))
@@ -5966,6 +5966,19 @@ out:
return ret;
}
+static bool reshape_inprogress(struct mddev *mddev)
+{
+ return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery);
+}
+
+static bool reshape_disabled(struct mddev *mddev)
+{
+ return is_md_suspended(mddev) || !md_is_rdwr(mddev);
+}
+
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
@@ -5997,7 +6010,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6076,6 +6090,15 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) &&
+ reshape_disabled(mddev)) {
+ bi->bi_status = BLK_STS_IOERR;
+ ret = STRIPE_FAIL;
+ pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n",
+ mdname(mddev));
+ }
+
return ret;
}
@@ -7708,7 +7731,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
}
sprintf(pers_name, "raid%d", mddev->new_level);
- conf->thread = md_register_thread(raid5d, mddev, pers_name);
+ rcu_assign_pointer(conf->thread,
+ md_register_thread(raid5d, mddev, pers_name));
if (!conf->thread) {
pr_warn("md/raid:%s: couldn't allocate thread.\n",
mdname(mddev));
@@ -7931,8 +7955,8 @@ static int raid5_run(struct mddev *mddev)
}
conf->min_offset_diff = min_offset_diff;
- mddev->thread = conf->thread;
- conf->thread = NULL;
+ rcu_assign_pointer(mddev->thread, conf->thread);
+ rcu_assign_pointer(conf->thread, NULL);
mddev->private = conf;
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
@@ -8029,8 +8053,8 @@ static int raid5_run(struct mddev *mddev)
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev->sync_thread = md_register_thread(md_do_sync, mddev,
- "reshape");
+ rcu_assign_pointer(mddev->sync_thread,
+ md_register_thread(md_do_sync, mddev, "reshape"));
if (!mddev->sync_thread)
goto abort;
}
@@ -8377,6 +8401,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p = conf->disks + disk;
tmp = rdev_mdlock_deref(mddev, p->rdev);
if (test_bit(WantReplacement, &tmp->flags) &&
+ mddev->reshape_position == MaxSector &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
@@ -8500,6 +8525,7 @@ static int raid5_start_reshape(struct mddev *mddev)
struct r5conf *conf = mddev->private;
struct md_rdev *rdev;
int spares = 0;
+ int i;
unsigned long flags;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -8511,6 +8537,13 @@ static int raid5_start_reshape(struct mddev *mddev)
if (has_failed(conf))
return -EINVAL;
+ /* raid5 can't handle concurrent reshape and recovery */
+ if (mddev->recovery_cp < MaxSector)
+ return -EBUSY;
+ for (i = 0; i < conf->raid_disks; i++)
+ if (rdev_mdlock_deref(mddev, conf->disks[i].replacement))
+ return -EBUSY;
+
rdev_for_each(rdev, mddev) {
if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags))
@@ -8607,8 +8640,8 @@ static int raid5_start_reshape(struct mddev *mddev)
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev->sync_thread = md_register_thread(md_do_sync, mddev,
- "reshape");
+ rcu_assign_pointer(mddev->sync_thread,
+ md_register_thread(md_do_sync, mddev, "reshape"));
if (!mddev->sync_thread) {
mddev->recovery = 0;
spin_lock_irq(&conf->device_lock);
@@ -9043,6 +9076,22 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ wait_event(mddev->sb_wait, !reshape_inprogress(mddev) ||
+ percpu_ref_is_zero(&mddev->active_io));
+ if (percpu_ref_is_zero(&mddev->active_io))
+ return;
+
+ /*
+ * Reshape is not in progress and the array is suspended, so io that is
+ * waiting for reshape can never complete.
+ */
+ wake_up(&conf->wait_for_overlap);
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -9063,6 +9112,7 @@ static struct md_personality raid6_personality =
.check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9087,6 +9137,7 @@ static struct md_personality raid5_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9112,6 +9163,7 @@ static struct md_personality raid4_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
+ .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e873938a6125..97a795979a35 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -268,7 +268,7 @@ struct stripe_head {
unsigned long flags;
u32 log_checksum;
unsigned short write_hint;
- } dev[1]; /* allocated with extra space depending of RAID geometry */
+ } dev[]; /* allocated depending of RAID geometry ("disks" member) */
};
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
@@ -679,7 +679,7 @@ struct r5conf {
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
*/
- struct md_thread *thread;
+ struct md_thread __rcu *thread;
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
struct r5worker_group *worker_groups;
int group_cnt;