Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-io.c    |  15
-rw-r--r--  drivers/md/dm-snap.c  | 120
-rw-r--r--  drivers/md/dm-thin.c  |  11
-rw-r--r--  drivers/md/dm.c       |  47
-rw-r--r--  drivers/md/md.c       |  17
-rw-r--r--  drivers/md/raid0.c    |   2
-rw-r--r--  drivers/md/raid1.c    |   5
-rw-r--r--  drivers/md/raid5.c    |  13
8 files changed, 179 insertions(+), 51 deletions(-)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 37de0173b6d2..74adcd2c967e 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -289,9 +289,16 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
struct request_queue *q = bdev_get_queue(where->bdev);
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
+ unsigned int uninitialized_var(special_cmd_max_sectors);
- /* Reject unsupported discard requests */
- if ((rw & REQ_DISCARD) && !blk_queue_discard(q)) {
+ /*
+ * Reject unsupported discard and write same requests.
+ */
+ if (rw & REQ_DISCARD)
+ special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (rw & REQ_WRITE_SAME)
+ special_cmd_max_sectors = q->limits.max_write_same_sectors;
+ if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) {
dec_count(io, region, -EOPNOTSUPP);
return;
}
@@ -317,7 +324,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
store_io_and_region_in_bio(bio, io, region);
if (rw & REQ_DISCARD) {
- num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
remaining -= num_sectors;
} else if (rw & REQ_WRITE_SAME) {
@@ -326,7 +333,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
*/
dp->get_page(dp, &page, &len, &offset);
bio_add_page(bio, page, logical_block_size, offset);
- num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
offset = 0;
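
Note: with this change do_region() looks up the queue's limit for the special command once, up front, and rejects the bio with -EOPNOTSUPP when that limit is zero, instead of testing blk_queue_discard() alone (which ignored WRITE SAME). The uninitialized_var() annotation only silences a false gcc warning: special_cmd_max_sectors is read solely on the REQ_DISCARD/REQ_WRITE_SAME paths, which always assign it first. A minimal sketch of the selection logic, not part of the patch:

    /* Sketch: per-queue sector limit for a "special" command.
     * 0 means the queue does not support the operation; callers
     * only consult this for discard and write-same bios. */
    static unsigned int special_cmd_limit(struct request_queue *q, int rw)
    {
            if (rw & REQ_DISCARD)
                    return q->limits.max_discard_sectors;
            if (rw & REQ_WRITE_SAME)
                    return q->limits.max_write_same_sectors;
            return 0;
    }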
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8b204ae216ab..f83a0f3fc365 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -20,6 +20,8 @@
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
+#include "dm.h"
+
#include "dm-exception-store.h"
#define DM_MSG_PREFIX "snapshots"
@@ -291,12 +293,23 @@ struct origin {
};
/*
+ * This structure is allocated for each origin target
+ */
+struct dm_origin {
+ struct dm_dev *dev;
+ struct dm_target *ti;
+ unsigned split_boundary;
+ struct list_head hash_list;
+};
+
+/*
* Size of the hash table for origin volumes. If we make this
* the size of the minors list then it should be nearly perfect
*/
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK 0xFF
static struct list_head *_origins;
+static struct list_head *_dm_origins;
static struct rw_semaphore _origins_lock;
static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
@@ -310,12 +323,22 @@ static int init_origin_hash(void)
_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
GFP_KERNEL);
if (!_origins) {
- DMERR("unable to allocate memory");
+ DMERR("unable to allocate memory for _origins");
return -ENOMEM;
}
-
for (i = 0; i < ORIGIN_HASH_SIZE; i++)
INIT_LIST_HEAD(_origins + i);
+
+ _dm_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!_dm_origins) {
+ DMERR("unable to allocate memory for _dm_origins");
+ kfree(_origins);
+ return -ENOMEM;
+ }
+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+ INIT_LIST_HEAD(_dm_origins + i);
+
init_rwsem(&_origins_lock);
return 0;
@@ -324,6 +347,7 @@ static int init_origin_hash(void)
static void exit_origin_hash(void)
{
kfree(_origins);
+ kfree(_dm_origins);
}
static unsigned origin_hash(struct block_device *bdev)
@@ -350,6 +374,30 @@ static void __insert_origin(struct origin *o)
list_add_tail(&o->hash_list, sl);
}
+static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
+{
+ struct list_head *ol;
+ struct dm_origin *o;
+
+ ol = &_dm_origins[origin_hash(origin)];
+ list_for_each_entry (o, ol, hash_list)
+ if (bdev_equal(o->dev->bdev, origin))
+ return o;
+
+ return NULL;
+}
+
+static void __insert_dm_origin(struct dm_origin *o)
+{
+ struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
+ list_add_tail(&o->hash_list, sl);
+}
+
+static void __remove_dm_origin(struct dm_origin *o)
+{
+ list_del(&o->hash_list);
+}
+
/*
* _origins_lock must be held when calling this function.
* Returns number of snapshots registered using the supplied cow device, plus:
@@ -1840,9 +1888,40 @@ static int snapshot_preresume(struct dm_target *ti)
static void snapshot_resume(struct dm_target *ti)
{
struct dm_snapshot *s = ti->private;
- struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
+ struct dm_origin *o;
+ struct mapped_device *origin_md = NULL;
+ bool must_restart_merging = false;
down_read(&_origins_lock);
+
+ o = __lookup_dm_origin(s->origin->bdev);
+ if (o)
+ origin_md = dm_table_get_md(o->ti->table);
+ if (!origin_md) {
+ (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
+ if (snap_merging)
+ origin_md = dm_table_get_md(snap_merging->ti->table);
+ }
+ if (origin_md == dm_table_get_md(ti->table))
+ origin_md = NULL;
+ if (origin_md) {
+ if (dm_hold(origin_md))
+ origin_md = NULL;
+ }
+
+ up_read(&_origins_lock);
+
+ if (origin_md) {
+ dm_internal_suspend_fast(origin_md);
+ if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
+ must_restart_merging = true;
+ stop_merge(snap_merging);
+ }
+ }
+
+ down_read(&_origins_lock);
+
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
down_write(&snap_src->lock);
@@ -1851,8 +1930,16 @@ static void snapshot_resume(struct dm_target *ti)
up_write(&snap_dest->lock);
up_write(&snap_src->lock);
}
+
up_read(&_origins_lock);
+ if (origin_md) {
+ if (must_restart_merging)
+ start_merge(snap_merging);
+ dm_internal_resume_fast(origin_md);
+ dm_put(origin_md);
+ }
+
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
@@ -2133,11 +2220,6 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
* Origin: maps a linear range of a device, with hooks for snapshotting.
*/
-struct dm_origin {
- struct dm_dev *dev;
- unsigned split_boundary;
-};
-
/*
* Construct an origin mapping: <dev_path>
* The context for an origin is merely a 'struct dm_dev *'
@@ -2166,6 +2248,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_open;
}
+ o->ti = ti;
ti->private = o;
ti->num_flush_bios = 1;
@@ -2180,6 +2263,7 @@ bad_alloc:
static void origin_dtr(struct dm_target *ti)
{
struct dm_origin *o = ti->private;
+
dm_put_device(ti, o->dev);
kfree(o);
}
@@ -2216,6 +2300,19 @@ static void origin_resume(struct dm_target *ti)
struct dm_origin *o = ti->private;
o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
+
+ down_write(&_origins_lock);
+ __insert_dm_origin(o);
+ up_write(&_origins_lock);
+}
+
+static void origin_postsuspend(struct dm_target *ti)
+{
+ struct dm_origin *o = ti->private;
+
+ down_write(&_origins_lock);
+ __remove_dm_origin(o);
+ up_write(&_origins_lock);
}
static void origin_status(struct dm_target *ti, status_type_t type,
@@ -2258,12 +2355,13 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 8, 1},
+ .version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
.map = origin_map,
.resume = origin_resume,
+ .postsuspend = origin_postsuspend,
.status = origin_status,
.merge = origin_merge,
.iterate_devices = origin_iterate_devices,
@@ -2271,7 +2369,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 12, 0},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2285,7 +2383,7 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
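
Note: taken together, the dm-snap changes make snapshot_resume() quiesce the origin device (or, failing that, the merging snapshot sharing the COW device) before the exception handover, so no origin I/O can be redirected while the snapshot's chunk size may still be stale. Condensed to its skeleton, with error handling and the merge stop/restart omitted, the pattern above is:

    down_read(&_origins_lock);
    o = __lookup_dm_origin(s->origin->bdev);
    origin_md = o ? dm_table_get_md(o->ti->table) : NULL;
    if (origin_md && dm_hold(origin_md))        /* -EBUSY: md being freed */
            origin_md = NULL;
    up_read(&_origins_lock);

    if (origin_md)
            dm_internal_suspend_fast(origin_md);        /* block origin I/O */

    /* ... handover under the snap_src/snap_dest locks ... */

    if (origin_md) {
            dm_internal_resume_fast(origin_md);
            dm_put(origin_md);
    }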
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 654773cb1eee..921aafd12aee 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2358,17 +2358,6 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
case -ENODATA:
- if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
- /*
- * This block isn't provisioned, and we have no way
- * of doing so.
- */
- handle_unserviceable_bio(tc->pool, bio);
- cell_defer_no_holder(tc, virt_cell);
- return DM_MAPIO_SUBMITTED;
- }
- /* fall through */
-
case -EWOULDBLOCK:
thin_defer_cell(tc, virt_cell);
return DM_MAPIO_SUBMITTED;
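
Note: with the PM_READ_ONLY special case gone, -ENODATA simply falls through to the -EWOULDBLOCK arm, so a bio for an unprovisioned block is always deferred to the worker thread, which has the pool context needed to fail or queue it appropriately. The resulting switch arm, with diff markers stripped:

    case -ENODATA:
    case -EWOULDBLOCK:
            thin_defer_cell(tc, virt_cell);
            return DM_MAPIO_SUBMITTED;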
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 73f28802dc7a..8001fe9e3434 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -433,7 +433,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
dm_get(md);
atomic_inc(&md->open_count);
-
out:
spin_unlock(&_minor_lock);
@@ -442,16 +441,20 @@ out:
static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
- struct mapped_device *md = disk->private_data;
+ struct mapped_device *md;
spin_lock(&_minor_lock);
+ md = disk->private_data;
+ if (WARN_ON(!md))
+ goto out;
+
if (atomic_dec_and_test(&md->open_count) &&
(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
queue_work(deferred_remove_workqueue, &deferred_remove_work);
dm_put(md);
-
+out:
spin_unlock(&_minor_lock);
}
@@ -2241,7 +2244,6 @@ static void free_dev(struct mapped_device *md)
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
- bdput(md->bdev);
destroy_workqueue(md->wq);
if (md->kworker_task)
@@ -2252,19 +2254,22 @@ static void free_dev(struct mapped_device *md)
mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
- blk_integrity_unregister(md->disk);
- del_gendisk(md->disk);
+
cleanup_srcu_struct(&md->io_barrier);
free_table_devices(&md->table_devices);
- free_minor(minor);
+ dm_stats_cleanup(&md->stats);
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
-
+ if (blk_get_integrity(md->disk))
+ blk_integrity_unregister(md->disk);
+ del_gendisk(md->disk);
put_disk(md->disk);
blk_cleanup_queue(md->queue);
- dm_stats_cleanup(&md->stats);
+ bdput(md->bdev);
+ free_minor(minor);
+
module_put(THIS_MODULE);
kfree(md);
}
@@ -2616,6 +2621,19 @@ void dm_get(struct mapped_device *md)
BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
+int dm_hold(struct mapped_device *md)
+{
+ spin_lock(&_minor_lock);
+ if (test_bit(DMF_FREEING, &md->flags)) {
+ spin_unlock(&_minor_lock);
+ return -EBUSY;
+ }
+ dm_get(md);
+ spin_unlock(&_minor_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_hold);
+
const char *dm_device_name(struct mapped_device *md)
{
return md->name;
@@ -2629,8 +2647,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
might_sleep();
- spin_lock(&_minor_lock);
map = dm_get_live_table(md, &srcu_idx);
+
+ spin_lock(&_minor_lock);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
@@ -2638,10 +2657,16 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
if (dm_request_based(md))
flush_kthread_worker(&md->kworker);
+ /*
+ * Take suspend_lock so that presuspend and postsuspend methods
+ * do not race with internal suspend.
+ */
+ mutex_lock(&md->suspend_lock);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
+ mutex_unlock(&md->suspend_lock);
/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
dm_put_live_table(md, srcu_idx);
@@ -3115,6 +3140,7 @@ void dm_internal_suspend_fast(struct mapped_device *md)
flush_workqueue(md->wq);
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
void dm_internal_resume_fast(struct mapped_device *md)
{
@@ -3126,6 +3152,7 @@ void dm_internal_resume_fast(struct mapped_device *md)
done:
mutex_unlock(&md->suspend_lock);
}
+EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
/*-----------------------------------------------------------------
* Event notification.
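
Note: dm_hold() is the non-fatal counterpart of dm_get(): dm_get() BUG()s on a device flagged DMF_FREEING, while dm_hold() checks the flag under _minor_lock and returns -EBUSY, letting a caller such as snapshot_resume() above back off from a device that is mid-teardown. A hypothetical caller, not taken from the patch:

    struct mapped_device *md = dm_table_get_md(table);

    if (dm_hold(md))
            return;         /* md is already being destroyed */
    /* ... md is pinned and safe to use ... */
    dm_put(md);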
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c8d2bac4e28b..717daad71fb1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2555,7 +2555,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
-__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
+__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
errors_show(struct md_rdev *rdev, char *page)
@@ -3638,7 +3638,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
-__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
+__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
+ resync_start_show, resync_start_store);
/*
* The array state can be:
@@ -3851,7 +3852,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
return err ?: len;
}
static struct md_sysfs_entry md_array_state =
-__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
+__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
@@ -4101,7 +4102,7 @@ out_unlock:
}
static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t
action_show(struct mddev *mddev, char *page)
@@ -4189,7 +4190,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
@@ -4335,7 +4336,8 @@ sync_completed_show(struct mddev *mddev, char *page)
return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
-static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed =
+ __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
static ssize_t
min_sync_show(struct mddev *mddev, char *page)
@@ -5078,7 +5080,8 @@ int md_run(struct mddev *mddev)
}
if (err) {
mddev_detach(mddev);
- pers->free(mddev, mddev->private);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
module_put(pers->owner);
bitmap_destroy(mddev);
return err;
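
Note: __ATTR_PREALLOC differs from __ATTR by OR-ing SYSFS_PREALLOC into the attribute mode, which tells sysfs to allocate the I/O buffer at open() time. Writes to attributes such as array_state and sync_action can then make progress even when the writer must not block on memory reclaim that may itself be waiting on the array. For reference, the macro of this era was defined along these lines in include/linux/sysfs.h (quoted from memory; treat as an approximation):

    #define __ATTR_PREALLOC(_name, _mode, _show, _store) {                 \
            .attr = { .name = __stringify(_name),                          \
                      .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) }, \
            .show  = _show,                                                \
            .store = _store,                                               \
    }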
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a13f738a7b39..3ed9f42ddca6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -467,8 +467,6 @@ static int raid0_run(struct mddev *mddev)
dump_zones(mddev);
ret = md_integrity_register(mddev);
- if (ret)
- raid0_free(mddev, conf);
return ret;
}
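
Note: this deletion pairs with the md.c change above. A failing md_integrity_register() makes raid0_run() return an error, and md_run()'s error path already calls pers->free() in that case, so the extra raid0_free() here freed the configuration twice. The corresponding md_run() error path, with diff markers stripped:

    if (err) {
            mddev_detach(mddev);
            if (mddev->private)
                    pers->free(mddev, mddev->private);
            module_put(pers->owner);
            bitmap_destroy(mddev);
            return err;
    }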
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4153da5d4011..d34e238afa54 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -560,7 +560,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0) {
+ if (best_dist_disk < 0) {
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad < this_sector)
@@ -569,7 +569,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
best_good_sectors = first_bad - this_sector;
} else
best_good_sectors = sectors;
- best_disk = disk;
+ best_dist_disk = disk;
+ best_pending_disk = disk;
}
continue;
}
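
Note: the write-mostly branch previously recorded the device in best_disk, which could short-circuit the final selection and return a write-mostly device even when a regular candidate followed. Recording it in best_dist_disk and best_pending_disk instead keeps it in the candidate pool that any non-write-mostly disk overrides, so it really is used only as a last resort. The resulting branch, condensed:

    if (test_bit(WriteMostly, &rdev->flags)) {
            /* Don't balance among write-mostly, just
             * use the first as a last resort */
            if (best_dist_disk < 0) {
                    /* ... bad-block checks as above ... */
                    best_dist_disk = disk;
                    best_pending_disk = disk;
            }
            continue;
    }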
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e75d48c0421a..cd2f96b2c572 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5121,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
schedule_timeout_uninterruptible(1);
}
/* Need to check if array will still be degraded after recovery/resync
- * We don't need to check the 'failed' flag as when that gets set,
- * recovery aborts.
+ * Note in case of > 1 drive failures it's possible we're rebuilding
+ * one drive while leaving another faulty drive in array.
*/
- for (i = 0; i < conf->raid_disks; i++)
- if (conf->disks[i].rdev == NULL)
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
+ }
+ rcu_read_unlock();
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
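
Note: conf->disks[i].rdev can be hot-removed while resync runs, so the loop now samples each pointer once with ACCESS_ONCE() inside an RCU read-side critical section, preventing both a re-read of a changing pointer and a use-after-free of the rdev mid-test; it additionally treats a present-but-Faulty device as degraded, which matters when one drive is being rebuilt while another failed drive is still in the array. The resulting loop, with diff markers stripped:

    rcu_read_lock();
    for (i = 0; i < conf->raid_disks; i++) {
            struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);

            if (rdev == NULL || test_bit(Faulty, &rdev->flags))
                    still_degraded = 1;
    }
    rcu_read_unlock();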