summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/device-mapper/cache-policies.txt39
-rw-r--r--block/blk-core.c2
-rw-r--r--drivers/md/Kconfig11
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/dm-cache-metadata.c98
-rw-r--r--drivers/md/dm-cache-metadata.h4
-rw-r--r--drivers/md/dm-cache-policy-mq.c1473
-rw-r--r--drivers/md/dm-cache-policy-smq.c92
-rw-r--r--drivers/md/dm-cache-target.c16
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-ioctl.c5
-rw-r--r--drivers/md/dm-log-writes.c2
-rw-r--r--drivers/md/dm-mpath.c216
-rw-r--r--drivers/md/dm-path-selector.h5
-rw-r--r--drivers/md/dm-queue-length.c37
-rw-r--r--drivers/md/dm-raid1.c2
-rw-r--r--drivers/md/dm-round-robin.c85
-rw-r--r--drivers/md/dm-service-time.c35
-rw-r--r--drivers/md/dm-snap.c11
-rw-r--r--drivers/md/dm-table.c66
-rw-r--r--drivers/md/dm-target.c3
-rw-r--r--drivers/md/dm-thin-metadata.c7
-rw-r--r--drivers/md/dm-thin.c23
-rw-r--r--drivers/md/dm-verity-fec.c2
-rw-r--r--drivers/md/dm-verity-target.c12
-rw-r--r--drivers/md/dm.c600
-rw-r--r--drivers/md/dm.h4
-rw-r--r--include/linux/device-mapper.h15
30 files changed, 849 insertions, 2024 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index d9246a32e673..e5062ad18717 100644
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -28,51 +28,16 @@ Overview of supplied cache replacement policies
multiqueue (mq)
---------------
-This policy has been deprecated in favor of the smq policy (see below).
+This policy is now an alias for smq (see below).
-The multiqueue policy has three sets of 16 queues: one set for entries
-waiting for the cache and another two for those in the cache (a set for
-clean entries and a set for dirty entries).
+The following tunables are accepted, but have no effect:
-Cache entries in the queues are aged based on logical time. Entry into
-the cache is based on variable thresholds and queue selection is based
-on hit count on entry. The policy aims to take different cache miss
-costs into account and to adjust to varying load patterns automatically.
-
-Message and constructor argument pairs are:
'sequential_threshold <#nr_sequential_ios>'
'random_threshold <#nr_random_ios>'
'read_promote_adjustment <value>'
'write_promote_adjustment <value>'
'discard_promote_adjustment <value>'
-The sequential threshold indicates the number of contiguous I/Os
-required before a stream is treated as sequential. Once a stream is
-considered sequential it will bypass the cache. The random threshold
-is the number of intervening non-contiguous I/Os that must be seen
-before the stream is treated as random again.
-
-The sequential and random thresholds default to 512 and 4 respectively.
-
-Large, sequential I/Os are probably better left on the origin device
-since spindles tend to have good sequential I/O bandwidth. The
-io_tracker counts contiguous I/Os to try to spot when the I/O is in one
-of these sequential modes. But there are use-cases for wanting to
-promote sequential blocks to the cache (e.g. fast application startup).
-If sequential threshold is set to 0 the sequential I/O detection is
-disabled and sequential I/O will no longer implicitly bypass the cache.
-Setting the random threshold to 0 does _not_ disable the random I/O
-stream detection.
-
-Internally the mq policy determines a promotion threshold. If the hit
-count of a block not in the cache goes above this threshold it gets
-promoted to the cache. The read, write and discard promote adjustment
-tunables allow you to tweak the promotion threshold by adding a small
-value based on the io type. They default to 4, 8 and 1 respectively.
-If you're trying to quickly warm a new cache device you may wish to
-reduce these to encourage promotion. Remember to switch them back to
-their defaults after the cache fills though.
-
Stochastic multiqueue (smq)
---------------------------
diff --git a/block/blk-core.c b/block/blk-core.c
index b83d29755b5a..45f4d7efbf34 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2198,7 +2198,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
if (q->mq_ops) {
if (blk_queue_io_stat(q))
blk_account_io_start(rq, true);
- blk_mq_insert_request(rq, false, true, true);
+ blk_mq_insert_request(rq, false, true, false);
return 0;
}
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 0a2e7273db9e..02a5345a44a6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -249,6 +249,7 @@ config DM_DEBUG_BLOCK_STACK_TRACING
block manager locking used by thin provisioning and caching.
If unsure, say N.
+
config DM_BIO_PRISON
tristate
depends on BLK_DEV_DM
@@ -304,16 +305,6 @@ config DM_CACHE
algorithms used to select which blocks are promoted, demoted,
cleaned etc. It supports writeback and writethrough modes.
-config DM_CACHE_MQ
- tristate "MQ Cache Policy (EXPERIMENTAL)"
- depends on DM_CACHE
- default y
- ---help---
- A cache policy that uses a multiqueue ordered by recent hit
- count to select which blocks should be promoted and demoted.
- This is meant to be a general purpose policy. It prioritises
- reads over writes.
-
config DM_CACHE_SMQ
tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 62a65764e8e0..52ba8dd82821 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -12,7 +12,6 @@ dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
-dm-cache-mq-y += dm-cache-policy-mq.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
@@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
-obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index f6543f3a970f..27f2ef300f8b 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -867,19 +867,40 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
return 0;
}
-#define WRITE_LOCK(cmd) \
- if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+#define WRITE_LOCK(cmd) \
+ down_write(&cmd->root_lock); \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+ up_write(&cmd->root_lock); \
return -EINVAL; \
- down_write(&cmd->root_lock)
+ }
#define WRITE_LOCK_VOID(cmd) \
- if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+ down_write(&cmd->root_lock); \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+ up_write(&cmd->root_lock); \
return; \
- down_write(&cmd->root_lock)
+ }
#define WRITE_UNLOCK(cmd) \
up_write(&cmd->root_lock)
+#define READ_LOCK(cmd) \
+ down_read(&cmd->root_lock); \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+ up_read(&cmd->root_lock); \
+ return -EINVAL; \
+ }
+
+#define READ_LOCK_VOID(cmd) \
+ down_read(&cmd->root_lock); \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+ up_read(&cmd->root_lock); \
+ return; \
+ }
+
+#define READ_UNLOCK(cmd) \
+ up_read(&cmd->root_lock)
+
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
{
int r;
@@ -1015,22 +1036,20 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd,
{
int r;
- down_read(&cmd->root_lock);
+ READ_LOCK(cmd);
r = __load_discards(cmd, fn, context);
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
return r;
}
-dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result)
{
- dm_cblock_t r;
+ READ_LOCK(cmd);
+ *result = cmd->cache_blocks;
+ READ_UNLOCK(cmd);
- down_read(&cmd->root_lock);
- r = cmd->cache_blocks;
- up_read(&cmd->root_lock);
-
- return r;
+ return 0;
}
static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
@@ -1188,9 +1207,9 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
{
int r;
- down_read(&cmd->root_lock);
+ READ_LOCK(cmd);
r = __load_mappings(cmd, policy, fn, context);
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
return r;
}
@@ -1215,18 +1234,18 @@ static int __dump_mappings(struct dm_cache_metadata *cmd)
void dm_cache_dump(struct dm_cache_metadata *cmd)
{
- down_read(&cmd->root_lock);
+ READ_LOCK_VOID(cmd);
__dump_mappings(cmd);
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
}
int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
{
int r;
- down_read(&cmd->root_lock);
+ READ_LOCK(cmd);
r = cmd->changed;
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
return r;
}
@@ -1276,9 +1295,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats)
{
- down_read(&cmd->root_lock);
+ READ_LOCK_VOID(cmd);
*stats = cmd->stats;
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
}
void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
@@ -1312,9 +1331,9 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
{
int r = -EINVAL;
- down_read(&cmd->root_lock);
+ READ_LOCK(cmd);
r = dm_sm_get_nr_free(cmd->metadata_sm, result);
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
return r;
}
@@ -1324,9 +1343,9 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
{
int r = -EINVAL;
- down_read(&cmd->root_lock);
+ READ_LOCK(cmd);
r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
- up_read(&cmd->root_lock);
+ READ_UNLOCK(cmd);
return r;
}
@@ -1417,7 +1436,13 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
{
- return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+ int r;
+
+ READ_LOCK(cmd);
+ r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+ READ_UNLOCK(cmd);
+
+ return r;
}
void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
@@ -1440,10 +1465,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
struct dm_block *sblock;
struct cache_disk_superblock *disk_super;
- /*
- * We ignore fail_io for this function.
- */
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
set_bit(NEEDS_CHECK, &cmd->flags);
r = superblock_lock(cmd, &sblock);
@@ -1458,19 +1480,17 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
dm_bm_unlock(sblock);
out:
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
-bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result)
{
- bool needs_check;
+ READ_LOCK(cmd);
+ *result = !!test_bit(NEEDS_CHECK, &cmd->flags);
+ READ_UNLOCK(cmd);
- down_read(&cmd->root_lock);
- needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
- up_read(&cmd->root_lock);
-
- return needs_check;
+ return 0;
}
int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 2ffee21f318d..8528744195e5 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -66,7 +66,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
* origin blocks to map to.
*/
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
-dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
+int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result);
int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
sector_t discard_block_size,
@@ -137,7 +137,7 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
*/
int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
-bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result);
int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
deleted file mode 100644
index ddb26980cd66..000000000000
--- a/drivers/md/dm-cache-policy-mq.c
+++ /dev/null
@@ -1,1473 +0,0 @@
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-cache-policy.h"
-#include "dm.h"
-
-#include <linux/hash.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#define DM_MSG_PREFIX "cache-policy-mq"
-
-static struct kmem_cache *mq_entry_cache;
-
-/*----------------------------------------------------------------*/
-
-static unsigned next_power(unsigned n, unsigned min)
-{
- return roundup_pow_of_two(max(n, min));
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Large, sequential ios are probably better left on the origin device since
- * spindles tend to have good bandwidth.
- *
- * The io_tracker tries to spot when the io is in one of these sequential
- * modes.
- *
- * Two thresholds to switch between random and sequential io mode are defaulting
- * as follows and can be adjusted via the constructor and message interfaces.
- */
-#define RANDOM_THRESHOLD_DEFAULT 4
-#define SEQUENTIAL_THRESHOLD_DEFAULT 512
-
-enum io_pattern {
- PATTERN_SEQUENTIAL,
- PATTERN_RANDOM
-};
-
-struct io_tracker {
- enum io_pattern pattern;
-
- unsigned nr_seq_samples;
- unsigned nr_rand_samples;
- unsigned thresholds[2];
-
- dm_oblock_t last_end_oblock;
-};
-
-static void iot_init(struct io_tracker *t,
- int sequential_threshold, int random_threshold)
-{
- t->pattern = PATTERN_RANDOM;
- t->nr_seq_samples = 0;
- t->nr_rand_samples = 0;
- t->last_end_oblock = 0;
- t->thresholds[PATTERN_RANDOM] = random_threshold;
- t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
-}
-
-static enum io_pattern iot_pattern(struct io_tracker *t)
-{
- return t->pattern;
-}
-
-static void iot_update_stats(struct io_tracker *t, struct bio *bio)
-{
- if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
- t->nr_seq_samples++;
- else {
- /*
- * Just one non-sequential IO is enough to reset the
- * counters.
- */
- if (t->nr_seq_samples) {
- t->nr_seq_samples = 0;
- t->nr_rand_samples = 0;
- }
-
- t->nr_rand_samples++;
- }
-
- t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
-}
-
-static void iot_check_for_pattern_switch(struct io_tracker *t)
-{
- switch (t->pattern) {
- case PATTERN_SEQUENTIAL:
- if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
- t->pattern = PATTERN_RANDOM;
- t->nr_seq_samples = t->nr_rand_samples = 0;
- }
- break;
-
- case PATTERN_RANDOM:
- if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
- t->pattern = PATTERN_SEQUENTIAL;
- t->nr_seq_samples = t->nr_rand_samples = 0;
- }
- break;
- }
-}
-
-static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
-{
- iot_update_stats(t, bio);
- iot_check_for_pattern_switch(t);
-}
-
-/*----------------------------------------------------------------*/
-
-
-/*
- * This queue is divided up into different levels. Allowing us to push
- * entries to the back of any of the levels. Think of it as a partially
- * sorted queue.
- */
-#define NR_QUEUE_LEVELS 16u
-#define NR_SENTINELS NR_QUEUE_LEVELS * 3
-
-#define WRITEBACK_PERIOD HZ
-
-struct queue {
- unsigned nr_elts;
- bool current_writeback_sentinels;
- unsigned long next_writeback;
- struct list_head qs[NR_QUEUE_LEVELS];
- struct list_head sentinels[NR_SENTINELS];
-};
-
-static void queue_init(struct queue *q)
-{
- unsigned i;
-
- q->nr_elts = 0;
- q->current_writeback_sentinels = false;
- q->next_writeback = 0;
- for (i = 0; i < NR_QUEUE_LEVELS; i++) {
- INIT_LIST_HEAD(q->qs + i);
- INIT_LIST_HEAD(q->sentinels + i);
- INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
- INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
- }
-}
-
-static unsigned queue_size(struct queue *q)
-{
- return q->nr_elts;
-}
-
-static bool queue_empty(struct queue *q)
-{
- return q->nr_elts == 0;
-}
-
-/*
- * Insert an entry to the back of the given level.
- */
-static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
-{
- q->nr_elts++;
- list_add_tail(elt, q->qs + level);
-}
-
-static void queue_remove(struct queue *q, struct list_head *elt)
-{
- q->nr_elts--;
- list_del(elt);
-}
-
-static bool is_sentinel(struct queue *q, struct list_head *h)
-{
- return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
-}
-
-/*
- * Gives us the oldest entry of the lowest popoulated level. If the first
- * level is emptied then we shift down one level.
- */
-static struct list_head *queue_peek(struct queue *q)
-{
- unsigned level;
- struct list_head *h;
-
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_for_each(h, q->qs + level)
- if (!is_sentinel(q, h))
- return h;
-
- return NULL;
-}
-
-static struct list_head *queue_pop(struct queue *q)
-{
- struct list_head *r = queue_peek(q);
-
- if (r) {
- q->nr_elts--;
- list_del(r);
- }
-
- return r;
-}
-
-/*
- * Pops an entry from a level that is not past a sentinel.
- */
-static struct list_head *queue_pop_old(struct queue *q)
-{
- unsigned level;
- struct list_head *h;
-
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_for_each(h, q->qs + level) {
- if (is_sentinel(q, h))
- break;
-
- q->nr_elts--;
- list_del(h);
- return h;
- }
-
- return NULL;
-}
-
-static struct list_head *list_pop(struct list_head *lh)
-{
- struct list_head *r = lh->next;
-
- BUG_ON(!r);
- list_del_init(r);
-
- return r;
-}
-
-static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
-{
- if (q->current_writeback_sentinels)
- return q->sentinels + NR_QUEUE_LEVELS + level;
- else
- return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
-}
-
-static void queue_update_writeback_sentinels(struct queue *q)
-{
- unsigned i;
- struct list_head *h;
-
- if (time_after(jiffies, q->next_writeback)) {
- for (i = 0; i < NR_QUEUE_LEVELS; i++) {
- h = writeback_sentinel(q, i);
- list_del(h);
- list_add_tail(h, q->qs + i);
- }
-
- q->next_writeback = jiffies + WRITEBACK_PERIOD;
- q->current_writeback_sentinels = !q->current_writeback_sentinels;
- }
-}
-
-/*
- * Sometimes we want to iterate through entries that have been pushed since
- * a certain event. We use sentinel entries on the queues to delimit these
- * 'tick' events.
- */
-static void queue_tick(struct queue *q)
-{
- unsigned i;
-
- for (i = 0; i < NR_QUEUE_LEVELS; i++) {
- list_del(q->sentinels + i);
- list_add_tail(q->sentinels + i, q->qs + i);
- }
-}
-
-typedef void (*iter_fn)(struct list_head *, void *);
-static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
-{
- unsigned i;
- struct list_head *h;
-
- for (i = 0; i < NR_QUEUE_LEVELS; i++) {
- list_for_each_prev(h, q->qs + i) {
- if (is_sentinel(q, h))
- break;
-
- fn(h, context);
- }
- }
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Describes a cache entry. Used in both the cache and the pre_cache.
- */
-struct entry {
- struct hlist_node hlist;
- struct list_head list;
- dm_oblock_t oblock;
-
- /*
- * FIXME: pack these better
- */
- bool dirty:1;
- unsigned hit_count;
-};
-
-/*
- * Rather than storing the cblock in an entry, we allocate all entries in
- * an array, and infer the cblock from the entry position.
- *
- * Free entries are linked together into a list.
- */
-struct entry_pool {
- struct entry *entries, *entries_end;
- struct list_head free;
- unsigned nr_allocated;
-};
-
-static int epool_init(struct entry_pool *ep, unsigned nr_entries)
-{
- unsigned i;
-
- ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
- if (!ep->entries)
- return -ENOMEM;
-
- ep->entries_end = ep->entries + nr_entries;
-
- INIT_LIST_HEAD(&ep->free);
- for (i = 0; i < nr_entries; i++)
- list_add(&ep->entries[i].list, &ep->free);
-
- ep->nr_allocated = 0;
-
- return 0;
-}
-
-static void epool_exit(struct entry_pool *ep)
-{
- vfree(ep->entries);
-}
-
-static struct entry *alloc_entry(struct entry_pool *ep)
-{
- struct entry *e;
-
- if (list_empty(&ep->free))
- return NULL;
-
- e = list_entry(list_pop(&ep->free), struct entry, list);
- INIT_LIST_HEAD(&e->list);
- INIT_HLIST_NODE(&e->hlist);
- ep->nr_allocated++;
-
- return e;
-}
-
-/*
- * This assumes the cblock hasn't already been allocated.
- */
-static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
-{
- struct entry *e = ep->entries + from_cblock(cblock);
-
- list_del_init(&e->list);
- INIT_HLIST_NODE(&e->hlist);
- ep->nr_allocated++;
-
- return e;
-}
-
-static void free_entry(struct entry_pool *ep, struct entry *e)
-{
- BUG_ON(!ep->nr_allocated);
- ep->nr_allocated--;
- INIT_HLIST_NODE(&e->hlist);
- list_add(&e->list, &ep->free);
-}
-
-/*
- * Returns NULL if the entry is free.
- */
-static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
-{
- struct entry *e = ep->entries + from_cblock(cblock);
- return !hlist_unhashed(&e->hlist) ? e : NULL;
-}
-
-static bool epool_empty(struct entry_pool *ep)
-{
- return list_empty(&ep->free);
-}
-
-static bool in_pool(struct entry_pool *ep, struct entry *e)
-{
- return e >= ep->entries && e < ep->entries_end;
-}
-
-static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
-{
- return to_cblock(e - ep->entries);
-}
-
-/*----------------------------------------------------------------*/
-
-struct mq_policy {
- struct dm_cache_policy policy;
-
- /* protects everything */
- struct mutex lock;
- dm_cblock_t cache_size;
- struct io_tracker tracker;
-
- /*
- * Entries come from two pools, one of pre-cache entries, and one
- * for the cache proper.
- */
- struct entry_pool pre_cache_pool;
- struct entry_pool cache_pool;
-
- /*
- * We maintain three queues of entries. The cache proper,
- * consisting of a clean and dirty queue, contains the currently
- * active mappings. Whereas the pre_cache tracks blocks that
- * are being hit frequently and potential candidates for promotion
- * to the cache.
- */
- struct queue pre_cache;
- struct queue cache_clean;
- struct queue cache_dirty;
-
- /*
- * Keeps track of time, incremented by the core. We use this to
- * avoid attributing multiple hits within the same tick.
- *
- * Access to tick_protected should be done with the spin lock held.
- * It's copied to tick at the start of the map function (within the
- * mutex).
- */
- spinlock_t tick_lock;
- unsigned tick_protected;
- unsigned tick;
-
- /*
- * A count of the number of times the map function has been called
- * and found an entry in the pre_cache or cache. Currently used to
- * calculate the generation.
- */
- unsigned hit_count;
-
- /*
- * A generation is a longish period that is used to trigger some
- * book keeping effects. eg, decrementing hit counts on entries.
- * This is needed to allow the cache to evolve as io patterns
- * change.
- */
- unsigned generation;
- unsigned generation_period; /* in lookups (will probably change) */
-
- unsigned discard_promote_adjustment;
- unsigned read_promote_adjustment;
- unsigned write_promote_adjustment;
-
- /*
- * The hash table allows us to quickly find an entry by origin
- * block. Both pre_cache and cache entries are in here.
- */
- unsigned nr_buckets;
- dm_block_t hash_bits;
- struct hlist_head *table;
-};
-
-#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
-#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
-#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
-#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
-
-/*----------------------------------------------------------------*/
-
-/*
- * Simple hash table implementation. Should replace with the standard hash
- * table that's making its way upstream.
- */
-static void hash_insert(struct mq_policy *mq, struct entry *e)
-{
- unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
-
- hlist_add_head(&e->hlist, mq->table + h);
-}
-
-static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
-{
- unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
- struct hlist_head *bucket = mq->table + h;
- struct entry *e;
-
- hlist_for_each_entry(e, bucket, hlist)
- if (e->oblock == oblock) {
- hlist_del(&e->hlist);
- hlist_add_head(&e->hlist, bucket);
- return e;
- }
-
- return NULL;
-}
-
-static void hash_remove(struct entry *e)
-{
- hlist_del(&e->hlist);
-}
-
-/*----------------------------------------------------------------*/
-
-static bool any_free_cblocks(struct mq_policy *mq)
-{
- return !epool_empty(&mq->cache_pool);
-}
-
-static bool any_clean_cblocks(struct mq_policy *mq)
-{
- return !queue_empty(&mq->cache_clean);
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Now we get to the meat of the policy. This section deals with deciding
- * when to to add entries to the pre_cache and cache, and move between
- * them.
- */
-
-/*
- * The queue level is based on the log2 of the hit count.
- */
-static unsigned queue_level(struct entry *e)
-{
- return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
-}
-
-static bool in_cache(struct mq_policy *mq, struct entry *e)
-{
- return in_pool(&mq->cache_pool, e);
-}
-
-/*
- * Inserts the entry into the pre_cache or the cache. Ensures the cache
- * block is marked as allocated if necc. Inserts into the hash table.
- * Sets the tick which records when the entry was last moved about.
- */
-static void push(struct mq_policy *mq, struct entry *e)
-{
- hash_insert(mq, e);
-
- if (in_cache(mq, e))
- queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
- queue_level(e), &e->list);
- else
- queue_push(&mq->pre_cache, queue_level(e), &e->list);
-}
-
-/*
- * Removes an entry from pre_cache or cache. Removes from the hash table.
- */
-static void del(struct mq_policy *mq, struct entry *e)
-{
- if (in_cache(mq, e))
- queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
- else
- queue_remove(&mq->pre_cache, &e->list);
-
- hash_remove(e);
-}
-
-/*
- * Like del, except it removes the first entry in the queue (ie. the least
- * recently used).
- */
-static struct entry *pop(struct mq_policy *mq, struct queue *q)
-{
- struct entry *e;
- struct list_head *h = queue_pop(q);
-
- if (!h)
- return NULL;
-
- e = container_of(h, struct entry, list);
- hash_remove(e);
-
- return e;
-}
-
-static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
-{
- struct entry *e;
- struct list_head *h = queue_pop_old(q);
-
- if (!h)
- return NULL;
-
- e = container_of(h, struct entry, list);
- hash_remove(e);
-
- return e;
-}
-
-static struct entry *peek(struct queue *q)
-{
- struct list_head *h = queue_peek(q);
- return h ? container_of(h, struct entry, list) : NULL;
-}
-
-/*
- * The promotion threshold is adjusted every generation. As are the counts
- * of the entries.
- *
- * At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries across all levels in
- * ascending order, giving preference to the clean entries at each level).
- *
- * We can be much cleverer than this though. For example, each promotion
- * could bump up the threshold helping to prevent churn. Much more to do
- * here.
- */
-
-#define MAX_TO_AVERAGE 20
-
-static void check_generation(struct mq_policy *mq)
-{
- unsigned total = 0, nr = 0, count = 0, level;
- struct list_head *head;
- struct entry *e;
-
- if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
- mq->hit_count = 0;
- mq->generation++;
-
- for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
- head = mq->cache_clean.qs + level;
- list_for_each_entry(e, head, list) {
- nr++;
- total += e->hit_count;
-
- if (++count >= MAX_TO_AVERAGE)
- break;
- }
-
- head = mq->cache_dirty.qs + level;
- list_for_each_entry(e, head, list) {
- nr++;
- total += e->hit_count;
-
- if (++count >= MAX_TO_AVERAGE)
- break;
- }
- }
- }
-}
-
-/*
- * Whenever we use an entry we bump up it's hit counter, and push it to the
- * back to it's current level.
- */
-static void requeue(struct mq_policy *mq, struct entry *e)
-{
- check_generation(mq);
- del(mq, e);
- push(mq, e);
-}
-
-/*
- * Demote the least recently used entry from the cache to the pre_cache.
- * Returns the new cache entry to use, and the old origin block it was
- * mapped to.
- *
- * We drop the hit count on the demoted entry back to 1 to stop it bouncing
- * straight back into the cache if it's subsequently hit. There are
- * various options here, and more experimentation would be good:
- *
- * - just forget about the demoted entry completely (ie. don't insert it
- into the pre_cache).
- * - divide the hit count rather that setting to some hard coded value.
- * - set the hit count to a hard coded value other than 1, eg, is it better
- * if it goes in at level 2?
- */
-static int demote_cblock(struct mq_policy *mq,
- struct policy_locker *locker, dm_oblock_t *oblock)
-{
- struct entry *demoted = peek(&mq->cache_clean);
-
- if (!demoted)
- /*
- * We could get a block from mq->cache_dirty, but that
- * would add extra latency to the triggering bio as it
- * waits for the writeback. Better to not promote this
- * time and hope there's a clean block next time this block
- * is hit.
- */
- return -ENOSPC;
-
- if (locker->fn(locker, demoted->oblock))
- /*
- * We couldn't lock the demoted block.
- */
- return -EBUSY;
-
- del(mq, demoted);
- *oblock = demoted->oblock;
- free_entry(&mq->cache_pool, demoted);
-
- /*
- * We used to put the demoted block into the pre-cache, but I think
- * it's simpler to just let it work it's way up from zero again.
- * Stops blocks flickering in and out of the cache.
- */
-
- return 0;
-}
-
-/*
- * Entries in the pre_cache whose hit count passes the promotion
- * threshold move to the cache proper. Working out the correct
- * value for the promotion_threshold is crucial to this policy.
- */
-static unsigned promote_threshold(struct mq_policy *mq)
-{
- struct entry *e;
-
- if (any_free_cblocks(mq))
- return 0;
-
- e = peek(&mq->cache_clean);
- if (e)
- return e->hit_count;
-
- e = peek(&mq->cache_dirty);
- if (e)
- return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
-
- /* This should never happen */
- return 0;
-}
-
-/*
- * We modify the basic promotion_threshold depending on the specific io.
- *
- * If the origin block has been discarded then there's no cost to copy it
- * to the cache.
- *
- * We bias towards reads, since they can be demoted at no cost if they
- * haven't been dirtied.
- */
-static unsigned adjusted_promote_threshold(struct mq_policy *mq,
- bool discarded_oblock, int data_dir)
-{
- if (data_dir == READ)
- return promote_threshold(mq) + mq->read_promote_adjustment;
-
- if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
- /*
- * We don't need to do any copying at all, so give this a
- * very low threshold.
- */
- return mq->discard_promote_adjustment;
- }
-
- return promote_threshold(mq) + mq->write_promote_adjustment;
-}
-
-static bool should_promote(struct mq_policy *mq, struct entry *e,
- bool discarded_oblock, int data_dir)
-{
- return e->hit_count >=
- adjusted_promote_threshold(mq, discarded_oblock, data_dir);
-}
-
-static int cache_entry_found(struct mq_policy *mq,
- struct entry *e,
- struct policy_result *result)
-{
- requeue(mq, e);
-
- if (in_cache(mq, e)) {
- result->op = POLICY_HIT;
- result->cblock = infer_cblock(&mq->cache_pool, e);
- }
-
- return 0;
-}
-
-/*
- * Moves an entry from the pre_cache to the cache. The main work is
- * finding which cache block to use.
- */
-static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
- struct policy_locker *locker,
- struct policy_result *result)
-{
- int r;
- struct entry *new_e;
-
- /* Ensure there's a free cblock in the cache */
- if (epool_empty(&mq->cache_pool)) {
- result->op = POLICY_REPLACE;
- r = demote_cblock(mq, locker, &result->old_oblock);
- if (r) {
- result->op = POLICY_MISS;
- return 0;
- }
-
- } else
- result->op = POLICY_NEW;
-
- new_e = alloc_entry(&mq->cache_pool);
- BUG_ON(!new_e);
-
- new_e->oblock = e->oblock;
- new_e->dirty = false;
- new_e->hit_count = e->hit_count;
-
- del(mq, e);
- free_entry(&mq->pre_cache_pool, e);
- push(mq, new_e);
-
- result->cblock = infer_cblock(&mq->cache_pool, new_e);
-
- return 0;
-}
-
-static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
- bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_locker *locker,
- struct policy_result *result)
-{
- int r = 0;
-
- if (!should_promote(mq, e, discarded_oblock, data_dir)) {
- requeue(mq, e);
- result->op = POLICY_MISS;
-
- } else if (!can_migrate)
- r = -EWOULDBLOCK;
-
- else {
- requeue(mq, e);
- r = pre_cache_to_cache(mq, e, locker, result);
- }
-
- return r;
-}
-
-static void insert_in_pre_cache(struct mq_policy *mq,
- dm_oblock_t oblock)
-{
- struct entry *e = alloc_entry(&mq->pre_cache_pool);
-
- if (!e)
- /*
- * There's no spare entry structure, so we grab the least
- * used one from the pre_cache.
- */
- e = pop(mq, &mq->pre_cache);
-
- if (unlikely(!e)) {
- DMWARN("couldn't pop from pre cache");
- return;
- }
-
- e->dirty = false;
- e->oblock = oblock;
- e->hit_count = 1;
- push(mq, e);
-}
-
-static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
- struct policy_locker *locker,
- struct policy_result *result)
-{
- int r;
- struct entry *e;
-
- if (epool_empty(&mq->cache_pool)) {
- result->op = POLICY_REPLACE;
- r = demote_cblock(mq, locker, &result->old_oblock);
- if (unlikely(r)) {
- result->op = POLICY_MISS;
- insert_in_pre_cache(mq, oblock);
- return;
- }
-
- /*
- * This will always succeed, since we've just demoted.
- */
- e = alloc_entry(&mq->cache_pool);
- BUG_ON(!e);
-
- } else {
- e = alloc_entry(&mq->cache_pool);
- result->op = POLICY_NEW;
- }
-
- e->oblock = oblock;
- e->dirty = false;
- e->hit_count = 1;
- push(mq, e);
-
- result->cblock = infer_cblock(&mq->cache_pool, e);
-}
-
-static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
- bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_locker *locker,
- struct policy_result *result)
-{
- if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
- if (can_migrate)
- insert_in_cache(mq, oblock, locker, result);
- else
- return -EWOULDBLOCK;
- } else {
- insert_in_pre_cache(mq, oblock);
- result->op = POLICY_MISS;
- }
-
- return 0;
-}
-
-/*
- * Looks the oblock up in the hash table, then decides whether to put in
- * pre_cache, or cache etc.
- */
-static int map(struct mq_policy *mq, dm_oblock_t oblock,
- bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_locker *locker,
- struct policy_result *result)
-{
- int r = 0;
- struct entry *e = hash_lookup(mq, oblock);
-
- if (e && in_cache(mq, e))
- r = cache_entry_found(mq, e, result);
-
- else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
- iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
- result->op = POLICY_MISS;
-
- else if (e)
- r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
- data_dir, locker, result);
-
- else
- r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
- data_dir, locker, result);
-
- if (r == -EWOULDBLOCK)
- result->op = POLICY_MISS;
-
- return r;
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Public interface, via the policy struct. See dm-cache-policy.h for a
- * description of these.
- */
-
-static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
-{
- return container_of(p, struct mq_policy, policy);
-}
-
-static void mq_destroy(struct dm_cache_policy *p)
-{
- struct mq_policy *mq = to_mq_policy(p);
-
- vfree(mq->table);
- epool_exit(&mq->cache_pool);
- epool_exit(&mq->pre_cache_pool);
- kfree(mq);
-}
-
-static void update_pre_cache_hits(struct list_head *h, void *context)
-{
- struct entry *e = container_of(h, struct entry, list);
- e->hit_count++;
-}
-
-static void update_cache_hits(struct list_head *h, void *context)
-{
- struct mq_policy *mq = context;
- struct entry *e = container_of(h, struct entry, list);
- e->hit_count++;
- mq->hit_count++;
-}
-
-static void copy_tick(struct mq_policy *mq)
-{
- unsigned long flags, tick;
-
- spin_lock_irqsave(&mq->tick_lock, flags);
- tick = mq->tick_protected;
- if (tick != mq->tick) {
- queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
- queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
- queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
- mq->tick = tick;
- }
-
- queue_tick(&mq->pre_cache);
- queue_tick(&mq->cache_dirty);
- queue_tick(&mq->cache_clean);
- queue_update_writeback_sentinels(&mq->cache_dirty);
- spin_unlock_irqrestore(&mq->tick_lock, flags);
-}
-
-static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
- bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_locker *locker,
- struct policy_result *result)
-{
- int r;
- struct mq_policy *mq = to_mq_policy(p);
-
- result->op = POLICY_MISS;
-
- if (can_block)
- mutex_lock(&mq->lock);
- else if (!mutex_trylock(&mq->lock))
- return -EWOULDBLOCK;
-
- copy_tick(mq);
-
- iot_examine_bio(&mq->tracker, bio);
- r = map(mq, oblock, can_migrate, discarded_oblock,
- bio_data_dir(bio), locker, result);
-
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
-{
- int r;
- struct mq_policy *mq = to_mq_policy(p);
- struct entry *e;
-
- if (!mutex_trylock(&mq->lock))
- return -EWOULDBLOCK;
-
- e = hash_lookup(mq, oblock);
- if (e && in_cache(mq, e)) {
- *cblock = infer_cblock(&mq->cache_pool, e);
- r = 0;
- } else
- r = -ENOENT;
-
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
-{
- struct entry *e;
-
- e = hash_lookup(mq, oblock);
- BUG_ON(!e || !in_cache(mq, e));
-
- del(mq, e);
- e->dirty = set;
- push(mq, e);
-}
-
-static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- __mq_set_clear_dirty(mq, oblock, true);
- mutex_unlock(&mq->lock);
-}
-
-static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- __mq_set_clear_dirty(mq, oblock, false);
- mutex_unlock(&mq->lock);
-}
-
-static int mq_load_mapping(struct dm_cache_policy *p,
- dm_oblock_t oblock, dm_cblock_t cblock,
- uint32_t hint, bool hint_valid)
-{
- struct mq_policy *mq = to_mq_policy(p);
- struct entry *e;
-
- e = alloc_particular_entry(&mq->cache_pool, cblock);
- e->oblock = oblock;
- e->dirty = false; /* this gets corrected in a minute */
- e->hit_count = hint_valid ? hint : 1;
- push(mq, e);
-
- return 0;
-}
-
-static int mq_save_hints(struct mq_policy *mq, struct queue *q,
- policy_walk_fn fn, void *context)
-{
- int r;
- unsigned level;
- struct list_head *h;
- struct entry *e;
-
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_for_each(h, q->qs + level) {
- if (is_sentinel(q, h))
- continue;
-
- e = container_of(h, struct entry, list);
- r = fn(context, infer_cblock(&mq->cache_pool, e),
- e->oblock, e->hit_count);
- if (r)
- return r;
- }
-
- return 0;
-}
-
-static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
- void *context)
-{
- struct mq_policy *mq = to_mq_policy(p);
- int r = 0;
-
- mutex_lock(&mq->lock);
-
- r = mq_save_hints(mq, &mq->cache_clean, fn, context);
- if (!r)
- r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
-
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
-{
- struct entry *e;
-
- e = hash_lookup(mq, oblock);
- BUG_ON(!e || !in_cache(mq, e));
-
- del(mq, e);
- free_entry(&mq->cache_pool, e);
-}
-
-static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- __remove_mapping(mq, oblock);
- mutex_unlock(&mq->lock);
-}
-
-static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
- struct entry *e = epool_find(&mq->cache_pool, cblock);
-
- if (!e)
- return -ENODATA;
-
- del(mq, e);
- free_entry(&mq->cache_pool, e);
-
- return 0;
-}
-
-static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
- int r;
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- r = __remove_cblock(mq, cblock);
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-#define CLEAN_TARGET_PERCENTAGE 25
-
-static bool clean_target_met(struct mq_policy *mq)
-{
- /*
- * Cache entries may not be populated. So we're cannot rely on the
- * size of the clean queue.
- */
- unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
- unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
-
- return nr_clean >= target;
-}
-
-static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
- dm_cblock_t *cblock)
-{
- struct entry *e = pop_old(mq, &mq->cache_dirty);
-
- if (!e && !clean_target_met(mq))
- e = pop(mq, &mq->cache_dirty);
-
- if (!e)
- return -ENODATA;
-
- *oblock = e->oblock;
- *cblock = infer_cblock(&mq->cache_pool, e);
- e->dirty = false;
- push(mq, e);
-
- return 0;
-}
-
-static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
- dm_cblock_t *cblock, bool critical_only)
-{
- int r;
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- r = __mq_writeback_work(mq, oblock, cblock);
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-static void __force_mapping(struct mq_policy *mq,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
- struct entry *e = hash_lookup(mq, current_oblock);
-
- if (e && in_cache(mq, e)) {
- del(mq, e);
- e->oblock = new_oblock;
- e->dirty = true;
- push(mq, e);
- }
-}
-
-static void mq_force_mapping(struct dm_cache_policy *p,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- __force_mapping(mq, current_oblock, new_oblock);
- mutex_unlock(&mq->lock);
-}
-
-static dm_cblock_t mq_residency(struct dm_cache_policy *p)
-{
- dm_cblock_t r;
- struct mq_policy *mq = to_mq_policy(p);
-
- mutex_lock(&mq->lock);
- r = to_cblock(mq->cache_pool.nr_allocated);
- mutex_unlock(&mq->lock);
-
- return r;
-}
-
-static void mq_tick(struct dm_cache_policy *p, bool can_block)
-{
- struct mq_policy *mq = to_mq_policy(p);
- unsigned long flags;
-
- spin_lock_irqsave(&mq->tick_lock, flags);
- mq->tick_protected++;
- spin_unlock_irqrestore(&mq->tick_lock, flags);
-
- if (can_block) {
- mutex_lock(&mq->lock);
- copy_tick(mq);
- mutex_unlock(&mq->lock);
- }
-}
-
-static int mq_set_config_value(struct dm_cache_policy *p,
- const char *key, const char *value)
-{
- struct mq_policy *mq = to_mq_policy(p);
- unsigned long tmp;
-
- if (kstrtoul(value, 10, &tmp))
- return -EINVAL;
-
- if (!strcasecmp(key, "random_threshold")) {
- mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
-
- } else if (!strcasecmp(key, "sequential_threshold")) {
- mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
-
- } else if (!strcasecmp(key, "discard_promote_adjustment"))
- mq->discard_promote_adjustment = tmp;
-
- else if (!strcasecmp(key, "read_promote_adjustment"))
- mq->read_promote_adjustment = tmp;
-
- else if (!strcasecmp(key, "write_promote_adjustment"))
- mq->write_promote_adjustment = tmp;
-
- else
- return -EINVAL;
-
- return 0;
-}
-
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
- unsigned maxlen, ssize_t *sz_ptr)
-{
- ssize_t sz = *sz_ptr;
- struct mq_policy *mq = to_mq_policy(p);
-
- DMEMIT("10 random_threshold %u "
- "sequential_threshold %u "
- "discard_promote_adjustment %u "
- "read_promote_adjustment %u "
- "write_promote_adjustment %u ",
- mq->tracker.thresholds[PATTERN_RANDOM],
- mq->tracker.thresholds[PATTERN_SEQUENTIAL],
- mq->discard_promote_adjustment,
- mq->read_promote_adjustment,
- mq->write_promote_adjustment);
-
- *sz_ptr = sz;
- return 0;
-}
-
-/* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct mq_policy *mq)
-{
- mq->policy.destroy = mq_destroy;
- mq->policy.map = mq_map;
- mq->policy.lookup = mq_lookup;
- mq->policy.set_dirty = mq_set_dirty;
- mq->policy.clear_dirty = mq_clear_dirty;
- mq->policy.load_mapping = mq_load_mapping;
- mq->policy.walk_mappings = mq_walk_mappings;
- mq->policy.remove_mapping = mq_remove_mapping;
- mq->policy.remove_cblock = mq_remove_cblock;
- mq->policy.writeback_work = mq_writeback_work;
- mq->policy.force_mapping = mq_force_mapping;
- mq->policy.residency = mq_residency;
- mq->policy.tick = mq_tick;
- mq->policy.emit_config_values = mq_emit_config_values;
- mq->policy.set_config_value = mq_set_config_value;
-}
-
-static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
- sector_t origin_size,
- sector_t cache_block_size)
-{
- struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
-
- if (!mq)
- return NULL;
-
- init_policy_functions(mq);
- iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
- mq->cache_size = cache_size;
-
- if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
- DMERR("couldn't initialize pool of pre-cache entries");
- goto bad_pre_cache_init;
- }
-
- if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
- DMERR("couldn't initialize pool of cache entries");
- goto bad_cache_init;
- }
-
- mq->tick_protected = 0;
- mq->tick = 0;
- mq->hit_count = 0;
- mq->generation = 0;
- mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
- mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
- mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
- mutex_init(&mq->lock);
- spin_lock_init(&mq->tick_lock);
-
- queue_init(&mq->pre_cache);
- queue_init(&mq->cache_clean);
- queue_init(&mq->cache_dirty);
-
- mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
-
- mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
- mq->hash_bits = __ffs(mq->nr_buckets);
- mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
- if (!mq->table)
- goto bad_alloc_table;
-
- return &mq->policy;
-
-bad_alloc_table:
- epool_exit(&mq->cache_pool);
-bad_cache_init:
- epool_exit(&mq->pre_cache_pool);
-bad_pre_cache_init:
- kfree(mq);
-
- return NULL;
-}
-
-/*----------------------------------------------------------------*/
-
-static struct dm_cache_policy_type mq_policy_type = {
- .name = "mq",
- .version = {1, 4, 0},
- .hint_size = 4,
- .owner = THIS_MODULE,
- .create = mq_create
-};
-
-static int __init mq_init(void)
-{
- int r;
-
- mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
- sizeof(struct entry),
- __alignof__(struct entry),
- 0, NULL);
- if (!mq_entry_cache)
- return -ENOMEM;
-
- r = dm_cache_policy_register(&mq_policy_type);
- if (r) {
- DMERR("register failed %d", r);
- kmem_cache_destroy(mq_entry_cache);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void __exit mq_exit(void)
-{
- dm_cache_policy_unregister(&mq_policy_type);
-
- kmem_cache_destroy(mq_entry_cache);
-}
-
-module_init(mq_init);
-module_exit(mq_exit);
-
-MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("mq cache policy");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 28d4586748d0..cf48a617a3a4 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1567,8 +1567,48 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
spin_unlock_irqrestore(&mq->lock, flags);
}
+/*
+ * smq has no config values, but the old mq policy did. To avoid breaking
+ * software we continue to accept these configurables for the mq policy,
+ * but they have no effect.
+ */
+static int mq_set_config_value(struct dm_cache_policy *p,
+ const char *key, const char *value)
+{
+ unsigned long tmp;
+
+ if (kstrtoul(value, 10, &tmp))
+ return -EINVAL;
+
+ if (!strcasecmp(key, "random_threshold") ||
+ !strcasecmp(key, "sequential_threshold") ||
+ !strcasecmp(key, "discard_promote_adjustment") ||
+ !strcasecmp(key, "read_promote_adjustment") ||
+ !strcasecmp(key, "write_promote_adjustment")) {
+ DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
+{
+ ssize_t sz = *sz_ptr;
+
+ DMEMIT("10 random_threshold 0 "
+ "sequential_threshold 0 "
+ "discard_promote_adjustment 0 "
+ "read_promote_adjustment 0 "
+ "write_promote_adjustment 0 ");
+
+ *sz_ptr = sz;
+ return 0;
+}
+
/* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct smq_policy *mq)
+static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
mq->policy.destroy = smq_destroy;
mq->policy.map = smq_map;
@@ -1583,6 +1623,11 @@ static void init_policy_functions(struct smq_policy *mq)
mq->policy.force_mapping = smq_force_mapping;
mq->policy.residency = smq_residency;
mq->policy.tick = smq_tick;
+
+ if (mimic_mq) {
+ mq->policy.set_config_value = mq_set_config_value;
+ mq->policy.emit_config_values = mq_emit_config_values;
+ }
}
static bool too_many_hotspot_blocks(sector_t origin_size,
@@ -1606,9 +1651,10 @@ static void calc_hotspot_params(sector_t origin_size,
*hotspot_block_size /= 2u;
}
-static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
- sector_t origin_size,
- sector_t cache_block_size)
+static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size,
+ bool mimic_mq)
{
unsigned i;
unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1618,7 +1664,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
if (!mq)
return NULL;
- init_policy_functions(mq);
+ init_policy_functions(mq, mimic_mq);
mq->cache_size = cache_size;
mq->cache_block_size = cache_block_size;
@@ -1706,19 +1752,41 @@ bad_pool_init:
return NULL;
}
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ return __smq_create(cache_size, origin_size, cache_block_size, false);
+}
+
+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ return __smq_create(cache_size, origin_size, cache_block_size, true);
+}
+
/*----------------------------------------------------------------*/
static struct dm_cache_policy_type smq_policy_type = {
.name = "smq",
- .version = {1, 0, 0},
+ .version = {1, 5, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create
};
+static struct dm_cache_policy_type mq_policy_type = {
+ .name = "mq",
+ .version = {1, 5, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = mq_create,
+};
+
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = smq_create,
@@ -1735,9 +1803,17 @@ static int __init smq_init(void)
return -ENOMEM;
}
+ r = dm_cache_policy_register(&mq_policy_type);
+ if (r) {
+ DMERR("register failed (as mq) %d", r);
+ dm_cache_policy_unregister(&smq_policy_type);
+ return -ENOMEM;
+ }
+
r = dm_cache_policy_register(&default_policy_type);
if (r) {
DMERR("register failed (as default) %d", r);
+ dm_cache_policy_unregister(&mq_policy_type);
dm_cache_policy_unregister(&smq_policy_type);
return -ENOMEM;
}
@@ -1748,6 +1824,7 @@ static int __init smq_init(void)
static void __exit smq_exit(void)
{
dm_cache_policy_unregister(&smq_policy_type);
+ dm_cache_policy_unregister(&mq_policy_type);
dm_cache_policy_unregister(&default_policy_type);
}
@@ -1759,3 +1836,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");
MODULE_ALIAS("dm-cache-default");
+MODULE_ALIAS("dm-cache-mq");
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5780accffa30..ee0510f9a85e 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -984,9 +984,14 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod
static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
{
- bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+ bool needs_check;
enum cache_metadata_mode old_mode = get_cache_mode(cache);
+ if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
+ DMERR("unable to read needs_check flag, setting failure mode");
+ new_mode = CM_FAIL;
+ }
+
if (new_mode == CM_WRITE && needs_check) {
DMERR("%s: unable to switch cache to write mode until repaired.",
cache_device_name(cache));
@@ -2771,7 +2776,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->split_discard_bios = false;
cache->features = ca->features;
- ti->per_bio_data_size = get_per_bio_data_size(cache);
+ ti->per_io_data_size = get_per_bio_data_size(cache);
cache->callbacks.congested_fn = cache_is_congested;
dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -3510,6 +3515,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
char buf[BDEVNAME_SIZE];
struct cache *cache = ti->private;
dm_cblock_t residency;
+ bool needs_check;
switch (type) {
case STATUSTYPE_INFO:
@@ -3583,7 +3589,9 @@ static void cache_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("rw ");
- if (dm_cache_metadata_needs_check(cache->cmd))
+ r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
+
+ if (r || needs_check)
DMEMIT("needs_check ");
else
DMEMIT("- ");
@@ -3806,7 +3814,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 8, 0},
+ .version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3147c8d09ea8..5c934b670154 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1788,7 +1788,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- cc->per_bio_data_size = ti->per_bio_data_size =
+ cc->per_bio_data_size = ti->per_io_data_size =
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
ARCH_KMALLOC_MINALIGN);
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index b4c356a21123..cc70871a6d29 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -204,7 +204,7 @@ out:
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
- ti->per_bio_data_size = sizeof(struct dm_delay_info);
+ ti->per_io_data_size = sizeof(struct dm_delay_info);
ti->private = dc;
return 0;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 09e2afcafd2d..b7341de87015 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -220,7 +220,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
- ti->per_bio_data_size = sizeof(struct per_bio_data);
+ ti->per_io_data_size = sizeof(struct per_bio_data);
ti->private = fc;
return 0;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 80a439543259..2adf81d81fca 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1291,7 +1291,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
immutable_target_type = dm_get_immutable_target_type(md);
if (immutable_target_type &&
- (immutable_target_type != dm_table_get_immutable_target_type(t))) {
+ (immutable_target_type != dm_table_get_immutable_target_type(t)) &&
+ !dm_table_get_wildcard_target(t)) {
DMWARN("can't replace immutable target type %s",
immutable_target_type->name);
r = -EINVAL;
@@ -1303,7 +1304,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
dm_set_md_type(md, dm_table_get_type(t));
/* setup md->queue to reflect md's type (may block) */
- r = dm_setup_md_queue(md);
+ r = dm_setup_md_queue(md, t);
if (r) {
DMWARN("unable to set up device queue for new table.");
goto err_unlock_md_type;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 624589d51c2c..608302e222af 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -475,7 +475,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->flush_supported = true;
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->per_bio_data_size = sizeof(struct per_bio_data);
+ ti->per_io_data_size = sizeof(struct per_bio_data);
ti->private = lc;
return 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index cfa29f574c2a..677ba223e2ae 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,7 @@
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
+#include <linux/blk-mq.h>
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
@@ -33,11 +34,12 @@ struct pgpath {
struct list_head list;
struct priority_group *pg; /* Owning PG */
- unsigned is_active; /* Path status */
unsigned fail_count; /* Cumulative failure count */
struct dm_path path;
struct delayed_work activate_path;
+
+ bool is_active:1; /* Path status */
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -53,10 +55,10 @@ struct priority_group {
struct path_selector ps;
unsigned pg_num; /* Reference number */
- unsigned bypassed; /* Temporarily bypass this PG? */
-
unsigned nr_pgpaths; /* Number of paths in PG */
struct list_head pgpaths;
+
+ bool bypassed:1; /* Temporarily bypass this PG? */
};
/* Multipath context */
@@ -74,21 +76,20 @@ struct multipath {
wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
- unsigned pg_init_required; /* pg_init needs calling? */
unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
- unsigned pg_init_delay_retry; /* Delay pg_init retry? */
unsigned nr_valid_paths; /* Total number of usable paths */
struct pgpath *current_pgpath;
struct priority_group *current_pg;
struct priority_group *next_pg; /* Switch to this PG if set */
- unsigned repeat_count; /* I/Os left before calling PS again */
- unsigned queue_io:1; /* Must we queue all I/O? */
- unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
- unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
- unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
- unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
+ bool queue_io:1; /* Must we queue all I/O? */
+ bool queue_if_no_path:1; /* Queue I/O if last path fails? */
+ bool saved_queue_if_no_path:1; /* Saved state during suspension */
+ bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+ bool pg_init_disabled:1; /* pg_init is not currently allowed */
+ bool pg_init_required:1; /* pg_init needs calling? */
+ bool pg_init_delay_retry:1; /* Delay pg_init retry? */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
@@ -120,7 +121,6 @@ static struct kmem_cache *_mpio_cache;
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
-static int __pgpath_busy(struct pgpath *pgpath);
/*-----------------------------------------------
@@ -132,7 +132,7 @@ static struct pgpath *alloc_pgpath(void)
struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
if (pgpath) {
- pgpath->is_active = 1;
+ pgpath->is_active = true;
INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
}
@@ -181,25 +181,31 @@ static void free_priority_group(struct priority_group *pg,
kfree(pg);
}
-static struct multipath *alloc_multipath(struct dm_target *ti)
+static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
{
struct multipath *m;
- unsigned min_ios = dm_get_reserved_rq_based_ios();
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
INIT_LIST_HEAD(&m->priority_groups);
spin_lock_init(&m->lock);
- m->queue_io = 1;
+ m->queue_io = true;
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
INIT_WORK(&m->trigger_event, trigger_event);
init_waitqueue_head(&m->pg_init_wait);
mutex_init(&m->work_mutex);
- m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
- if (!m->mpio_pool) {
- kfree(m);
- return NULL;
+
+ m->mpio_pool = NULL;
+ if (!use_blk_mq) {
+ unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+ m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+ if (!m->mpio_pool) {
+ kfree(m);
+ return NULL;
+ }
}
+
m->ti = ti;
ti->private = m;
}
@@ -222,26 +228,41 @@ static void free_multipath(struct multipath *m)
kfree(m);
}
-static int set_mapinfo(struct multipath *m, union map_info *info)
+static struct dm_mpath_io *get_mpio(union map_info *info)
+{
+ return info->ptr;
+}
+
+static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
{
struct dm_mpath_io *mpio;
+ if (!m->mpio_pool) {
+ /* Use blk-mq pdu memory requested via per_io_data_size */
+ mpio = get_mpio(info);
+ memset(mpio, 0, sizeof(*mpio));
+ return mpio;
+ }
+
mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
if (!mpio)
- return -ENOMEM;
+ return NULL;
memset(mpio, 0, sizeof(*mpio));
info->ptr = mpio;
- return 0;
+ return mpio;
}
-static void clear_mapinfo(struct multipath *m, union map_info *info)
+static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
{
- struct dm_mpath_io *mpio = info->ptr;
+ /* Only needed for non blk-mq (.request_fn) multipath */
+ if (m->mpio_pool) {
+ struct dm_mpath_io *mpio = info->ptr;
- info->ptr = NULL;
- mempool_free(mpio, m->mpio_pool);
+ info->ptr = NULL;
+ mempool_free(mpio, m->mpio_pool);
+ }
}
/*-----------------------------------------------
@@ -257,7 +278,7 @@ static int __pg_init_all_paths(struct multipath *m)
return 0;
m->pg_init_count++;
- m->pg_init_required = 0;
+ m->pg_init_required = false;
/* Check here to reset pg_init_required */
if (!m->current_pg)
@@ -283,11 +304,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
/* Must we initialise the PG first, and queue I/O till it's ready? */
if (m->hw_handler_name) {
- m->pg_init_required = 1;
- m->queue_io = 1;
+ m->pg_init_required = true;
+ m->queue_io = true;
} else {
- m->pg_init_required = 0;
- m->queue_io = 0;
+ m->pg_init_required = false;
+ m->queue_io = false;
}
m->pg_init_count = 0;
@@ -298,7 +319,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
{
struct dm_path *path;
- path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
+ path = pg->ps.type->select_path(&pg->ps, nr_bytes);
if (!path)
return -ENXIO;
@@ -313,10 +334,10 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
struct priority_group *pg;
- unsigned bypassed = 1;
+ bool bypassed = true;
if (!m->nr_valid_paths) {
- m->queue_io = 0;
+ m->queue_io = false;
goto failed;
}
@@ -344,7 +365,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
continue;
if (!__choose_path_in_pg(m, pg, nr_bytes)) {
if (!bypassed)
- m->pg_init_delay_retry = 1;
+ m->pg_init_delay_retry = true;
return;
}
}
@@ -380,7 +401,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context,
struct request *rq, struct request **__clone)
{
- struct multipath *m = (struct multipath *) ti->private;
+ struct multipath *m = ti->private;
int r = DM_MAPIO_REQUEUE;
size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
struct pgpath *pgpath;
@@ -390,8 +411,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
spin_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
- if (!m->current_pgpath ||
- (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
+ if (!m->current_pgpath || !m->queue_io)
__choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
@@ -405,11 +425,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
goto out_unlock;
}
- if (set_mapinfo(m, map_context) < 0)
+ mpio = set_mpio(m, map_context);
+ if (!mpio)
/* ENOMEM, requeue */
goto out_unlock;
- mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
@@ -418,17 +438,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
spin_unlock_irq(&m->lock);
if (clone) {
- /* Old request-based interface: allocated clone is passed in */
+ /*
+ * Old request-based interface: allocated clone is passed in.
+ * Used by: .request_fn stacked on .request_fn path(s).
+ */
clone->q = bdev_get_queue(bdev);
clone->rq_disk = bdev->bd_disk;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
} else {
- /* blk-mq request-based interface */
- *__clone = blk_get_request(bdev_get_queue(bdev),
- rq_data_dir(rq), GFP_ATOMIC);
+ /*
+ * blk-mq request-based interface; used by both:
+ * .request_fn stacked on blk-mq path(s) and
+ * blk-mq stacked on blk-mq path(s).
+ */
+ *__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
+ rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
if (IS_ERR(*__clone)) {
/* ENOMEM, requeue */
- clear_mapinfo(m, map_context);
+ clear_request_fn_mpio(m, map_context);
return r;
}
(*__clone)->bio = (*__clone)->biotail = NULL;
@@ -463,14 +490,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
static void multipath_release_clone(struct request *clone)
{
- blk_put_request(clone);
+ blk_mq_free_request(clone);
}
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
-static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
- unsigned save_old_value)
+static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
+ bool save_old_value)
{
unsigned long flags;
@@ -776,12 +803,12 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
argc--;
if (!strcasecmp(arg_name, "queue_if_no_path")) {
- r = queue_if_no_path(m, 1, 0);
+ r = queue_if_no_path(m, true, false);
continue;
}
if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
- m->retain_attached_hw_handler = 1;
+ m->retain_attached_hw_handler = true;
continue;
}
@@ -820,11 +847,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+ bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
as.argc = argc;
as.argv = argv;
- m = alloc_multipath(ti);
+ m = alloc_multipath(ti, use_blk_mq);
if (!m) {
ti->error = "can't allocate multipath";
return -EINVAL;
@@ -880,6 +908,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ if (use_blk_mq)
+ ti->per_io_data_size = sizeof(struct dm_mpath_io);
return 0;
@@ -917,7 +947,7 @@ static void flush_multipath_work(struct multipath *m)
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- m->pg_init_disabled = 1;
+ m->pg_init_disabled = true;
spin_unlock_irqrestore(&m->lock, flags);
flush_workqueue(kmpath_handlerd);
@@ -926,7 +956,7 @@ static void flush_multipath_work(struct multipath *m)
flush_work(&m->trigger_event);
spin_lock_irqsave(&m->lock, flags);
- m->pg_init_disabled = 0;
+ m->pg_init_disabled = false;
spin_unlock_irqrestore(&m->lock, flags);
}
@@ -954,7 +984,7 @@ static int fail_path(struct pgpath *pgpath)
DMWARN("Failing path %s.", pgpath->path.dev->name);
pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
- pgpath->is_active = 0;
+ pgpath->is_active = false;
pgpath->fail_count++;
m->nr_valid_paths--;
@@ -987,18 +1017,13 @@ static int reinstate_path(struct pgpath *pgpath)
if (pgpath->is_active)
goto out;
- if (!pgpath->pg->ps.type->reinstate_path) {
- DMWARN("Reinstate path not supported by path selector %s",
- pgpath->pg->ps.type->name);
- r = -EINVAL;
- goto out;
- }
+ DMWARN("Reinstating path %s.", pgpath->path.dev->name);
r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
if (r)
goto out;
- pgpath->is_active = 1;
+ pgpath->is_active = true;
if (!m->nr_valid_paths++) {
m->current_pgpath = NULL;
@@ -1045,7 +1070,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
* Temporarily try to avoid having to use the specified PG
*/
static void bypass_pg(struct multipath *m, struct priority_group *pg,
- int bypassed)
+ bool bypassed)
{
unsigned long flags;
@@ -1078,7 +1103,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
spin_lock_irqsave(&m->lock, flags);
list_for_each_entry(pg, &m->priority_groups, list) {
- pg->bypassed = 0;
+ pg->bypassed = false;
if (--pgnum)
continue;
@@ -1096,7 +1121,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
* Set/clear bypassed status of a PG.
* PGs are numbered upwards from 1 in the order they were declared.
*/
-static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
+static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
{
struct priority_group *pg;
unsigned pgnum;
@@ -1120,17 +1145,17 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
/*
* Should we retry pg_init immediately?
*/
-static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
+static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
unsigned long flags;
- int limit_reached = 0;
+ bool limit_reached = false;
spin_lock_irqsave(&m->lock, flags);
if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
- m->pg_init_required = 1;
+ m->pg_init_required = true;
else
- limit_reached = 1;
+ limit_reached = true;
spin_unlock_irqrestore(&m->lock, flags);
@@ -1143,7 +1168,7 @@ static void pg_init_done(void *data, int errors)
struct priority_group *pg = pgpath->pg;
struct multipath *m = pg->m;
unsigned long flags;
- unsigned delay_retry = 0;
+ bool delay_retry = false;
/* device or driver problems */
switch (errors) {
@@ -1166,7 +1191,7 @@ static void pg_init_done(void *data, int errors)
* Probably doing something like FW upgrade on the
* controller so try the other pg.
*/
- bypass_pg(m, pg, 1);
+ bypass_pg(m, pg, true);
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
@@ -1177,6 +1202,7 @@ static void pg_init_done(void *data, int errors)
fail_path(pgpath);
errors = 0;
break;
+ case SCSI_DH_DEV_OFFLINED:
default:
/*
* We probably do not want to fail the path for a device
@@ -1194,7 +1220,7 @@ static void pg_init_done(void *data, int errors)
m->current_pg = NULL;
}
} else if (!m->pg_init_required)
- pg->bypassed = 0;
+ pg->bypassed = false;
if (--m->pg_init_in_progress)
/* Activations of other paths are still on going */
@@ -1205,7 +1231,7 @@ static void pg_init_done(void *data, int errors)
if (__pg_init_all_paths(m))
goto out;
}
- m->queue_io = 0;
+ m->queue_io = false;
/*
* Wake up any thread waiting to suspend.
@@ -1291,21 +1317,21 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
int error, union map_info *map_context)
{
struct multipath *m = ti->private;
- struct dm_mpath_io *mpio = map_context->ptr;
+ struct dm_mpath_io *mpio = get_mpio(map_context);
struct pgpath *pgpath;
struct path_selector *ps;
int r;
BUG_ON(!mpio);
- r = do_end_io(m, clone, error, mpio);
+ r = do_end_io(m, clone, error, mpio);
pgpath = mpio->pgpath;
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
- clear_mapinfo(m, map_context);
+ clear_request_fn_mpio(m, map_context);
return r;
}
@@ -1318,9 +1344,9 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
*/
static void multipath_presuspend(struct dm_target *ti)
{
- struct multipath *m = (struct multipath *) ti->private;
+ struct multipath *m = ti->private;
- queue_if_no_path(m, 0, 1);
+ queue_if_no_path(m, false, true);
}
static void multipath_postsuspend(struct dm_target *ti)
@@ -1337,7 +1363,7 @@ static void multipath_postsuspend(struct dm_target *ti)
*/
static void multipath_resume(struct dm_target *ti)
{
- struct multipath *m = (struct multipath *) ti->private;
+ struct multipath *m = ti->private;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
@@ -1366,7 +1392,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
{
int sz = 0;
unsigned long flags;
- struct multipath *m = (struct multipath *) ti->private;
+ struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *p;
unsigned pg_num;
@@ -1474,7 +1500,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
int r = -EINVAL;
struct dm_dev *dev;
- struct multipath *m = (struct multipath *) ti->private;
+ struct multipath *m = ti->private;
action_fn action;
mutex_lock(&m->work_mutex);
@@ -1486,10 +1512,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
- r = queue_if_no_path(m, 1, 0);
+ r = queue_if_no_path(m, true, false);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
- r = queue_if_no_path(m, 0, 0);
+ r = queue_if_no_path(m, false, false);
goto out;
}
}
@@ -1500,10 +1526,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
}
if (!strcasecmp(argv[0], "disable_group")) {
- r = bypass_pg_num(m, argv[1], 1);
+ r = bypass_pg_num(m, argv[1], true);
goto out;
} else if (!strcasecmp(argv[0], "enable_group")) {
- r = bypass_pg_num(m, argv[1], 0);
+ r = bypass_pg_num(m, argv[1], false);
goto out;
} else if (!strcasecmp(argv[0], "switch_group")) {
r = switch_pg_num(m, argv[1]);
@@ -1604,7 +1630,7 @@ out:
return ret;
}
-static int __pgpath_busy(struct pgpath *pgpath)
+static int pgpath_busy(struct pgpath *pgpath)
{
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
@@ -1621,7 +1647,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
*/
static int multipath_busy(struct dm_target *ti)
{
- int busy = 0, has_active = 0;
+ bool busy = false, has_active = false;
struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *pgpath;
@@ -1632,7 +1658,7 @@ static int multipath_busy(struct dm_target *ti)
/* pg_init in progress or no paths available */
if (m->pg_init_in_progress ||
(!m->nr_valid_paths && m->queue_if_no_path)) {
- busy = 1;
+ busy = true;
goto out;
}
/* Guess which priority_group will be used at next mapping time */
@@ -1654,13 +1680,12 @@ static int multipath_busy(struct dm_target *ti)
* If there is one non-busy active path at least, the path selector
* will be able to select it. So we consider such a pg as not busy.
*/
- busy = 1;
+ busy = true;
list_for_each_entry(pgpath, &pg->pgpaths, list)
if (pgpath->is_active) {
- has_active = 1;
-
- if (!__pgpath_busy(pgpath)) {
- busy = 0;
+ has_active = true;
+ if (!pgpath_busy(pgpath)) {
+ busy = false;
break;
}
}
@@ -1671,7 +1696,7 @@ static int multipath_busy(struct dm_target *ti)
* the current_pg will be changed at next mapping time.
* We need to try mapping to determine it.
*/
- busy = 0;
+ busy = false;
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1684,7 +1709,8 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
+ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index e7d1fa8b0459..b6eb5365b1a4 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -50,13 +50,8 @@ struct path_selector_type {
/*
* Chooses a path for this io, if no paths are available then
* NULL will be returned.
- *
- * repeat_count is the number of times to use the path before
- * calling the function again. 0 means don't call it again unless
- * the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
- unsigned *repeat_count,
size_t nr_bytes);
/*
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 3941fae0de9f..23f178641794 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -23,12 +23,13 @@
#include <linux/atomic.h>
#define DM_MSG_PREFIX "multipath queue-length"
-#define QL_MIN_IO 128
-#define QL_VERSION "0.1.0"
+#define QL_MIN_IO 1
+#define QL_VERSION "0.2.0"
struct selector {
struct list_head valid_paths;
struct list_head failed_paths;
+ spinlock_t lock;
};
struct path_info {
@@ -45,6 +46,7 @@ static struct selector *alloc_selector(void)
if (s) {
INIT_LIST_HEAD(&s->valid_paths);
INIT_LIST_HEAD(&s->failed_paths);
+ spin_lock_init(&s->lock);
}
return s;
@@ -113,6 +115,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
struct path_info *pi;
unsigned repeat_count = QL_MIN_IO;
char dummy;
+ unsigned long flags;
/*
* Arguments: [<repeat_count>]
@@ -129,6 +132,11 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
+ if (repeat_count > 1) {
+ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+ repeat_count = 1;
+ }
+
/* Allocate the path information structure */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
@@ -142,7 +150,9 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
path->pscontext = pi;
+ spin_lock_irqsave(&s->lock, flags);
list_add_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
@@ -151,16 +161,22 @@ static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
{
struct selector *s = ps->context;
struct path_info *pi = path->pscontext;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
list_move(&pi->list, &s->failed_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
}
static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
struct selector *s = ps->context;
struct path_info *pi = path->pscontext;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
list_move_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
@@ -168,14 +184,16 @@ static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
/*
* Select a path having the minimum number of in-flight I/Os
*/
-static struct dm_path *ql_select_path(struct path_selector *ps,
- unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
+ struct dm_path *ret = NULL;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
if (list_empty(&s->valid_paths))
- return NULL;
+ goto out;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
@@ -190,11 +208,12 @@ static struct dm_path *ql_select_path(struct path_selector *ps,
}
if (!best)
- return NULL;
-
- *repeat_count = best->repeat_count;
+ goto out;
- return best->path;
+ ret = best->path;
+out:
+ spin_unlock_irqrestore(&s->lock, flags);
+ return ret;
}
static int ql_start_io(struct path_selector *ps, struct dm_path *path,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f2a363a89629..b3ccf1e0d4f2 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1121,7 +1121,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
- ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
+ ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
ti->discard_zeroes_data_unsupported = true;
ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6ab1192cdd5f..4ace1da17db8 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,6 +17,8 @@
#include <linux/module.h>
#define DM_MSG_PREFIX "multipath round-robin"
+#define RR_MIN_IO 1000
+#define RR_VERSION "1.1.0"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
@@ -41,23 +43,48 @@ static void free_paths(struct list_head *paths)
* Round-robin selector
*---------------------------------------------------------------*/
-#define RR_MIN_IO 1000
-
struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
+ spinlock_t lock;
+ struct dm_path * __percpu *current_path;
+ struct percpu_counter repeat_count;
};
+static void set_percpu_current_path(struct selector *s, struct dm_path *path)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(s->current_path, cpu) = path;
+}
+
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s) {
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->invalid_paths);
- }
+ if (!s)
+ return NULL;
+
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ spin_lock_init(&s->lock);
+
+ s->current_path = alloc_percpu(struct dm_path *);
+ if (!s->current_path)
+ goto out_current_path;
+ set_percpu_current_path(s, NULL);
+
+ if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
+ goto out_repeat_count;
return s;
+
+out_repeat_count:
+ free_percpu(s->current_path);
+out_current_path:
+ kfree(s);
+ return NULL;;
}
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -74,10 +101,12 @@ static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
static void rr_destroy(struct path_selector *ps)
{
- struct selector *s = (struct selector *) ps->context;
+ struct selector *s = ps->context;
free_paths(&s->valid_paths);
free_paths(&s->invalid_paths);
+ free_percpu(s->current_path);
+ percpu_counter_destroy(&s->repeat_count);
kfree(s);
ps->context = NULL;
}
@@ -111,10 +140,11 @@ static int rr_status(struct path_selector *ps, struct dm_path *path,
static int rr_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
- struct selector *s = (struct selector *) ps->context;
+ struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = RR_MIN_IO;
char dummy;
+ unsigned long flags;
if (argc > 1) {
*error = "round-robin ps: incorrect number of arguments";
@@ -139,42 +169,65 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
path->pscontext = pi;
+ spin_lock_irqsave(&s->lock, flags);
list_add_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
{
- struct selector *s = (struct selector *) ps->context;
+ unsigned long flags;
+ struct selector *s = ps->context;
struct path_info *pi = p->pscontext;
+ spin_lock_irqsave(&s->lock, flags);
+ if (p == *this_cpu_ptr(s->current_path))
+ set_percpu_current_path(s, NULL);
+
list_move(&pi->list, &s->invalid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
}
static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
- struct selector *s = (struct selector *) ps->context;
+ unsigned long flags;
+ struct selector *s = ps->context;
struct path_info *pi = p->pscontext;
+ spin_lock_irqsave(&s->lock, flags);
list_move(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
-static struct dm_path *rr_select_path(struct path_selector *ps,
- unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
{
- struct selector *s = (struct selector *) ps->context;
+ unsigned long flags;
+ struct selector *s = ps->context;
struct path_info *pi = NULL;
+ struct dm_path *current_path = NULL;
+
+ current_path = *this_cpu_ptr(s->current_path);
+ if (current_path) {
+ percpu_counter_dec(&s->repeat_count);
+ if (percpu_counter_read_positive(&s->repeat_count) > 0)
+ return current_path;
+ }
+ spin_lock_irqsave(&s->lock, flags);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
- *repeat_count = pi->repeat_count;
+ percpu_counter_set(&s->repeat_count, pi->repeat_count);
+ set_percpu_current_path(s, pi->path);
+ current_path = pi->path;
}
+ spin_unlock_irqrestore(&s->lock, flags);
- return pi ? pi->path : NULL;
+ return current_path;
}
static struct path_selector_type rr_ps = {
@@ -198,7 +251,7 @@ static int __init dm_rr_init(void)
if (r < 0)
DMERR("register failed %d", r);
- DMINFO("version 1.0.0 loaded");
+ DMINFO("version " RR_VERSION " loaded");
return r;
}
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 9df8f6bd6418..7b8642045c55 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -19,11 +19,12 @@
#define ST_MAX_RELATIVE_THROUGHPUT 100
#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
-#define ST_VERSION "0.2.0"
+#define ST_VERSION "0.3.0"
struct selector {
struct list_head valid_paths;
struct list_head failed_paths;
+ spinlock_t lock;
};
struct path_info {
@@ -41,6 +42,7 @@ static struct selector *alloc_selector(void)
if (s) {
INIT_LIST_HEAD(&s->valid_paths);
INIT_LIST_HEAD(&s->failed_paths);
+ spin_lock_init(&s->lock);
}
return s;
@@ -111,6 +113,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
unsigned repeat_count = ST_MIN_IO;
unsigned relative_throughput = 1;
char dummy;
+ unsigned long flags;
/*
* Arguments: [<repeat_count> [<relative_throughput>]]
@@ -134,6 +137,11 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
+ if (repeat_count > 1) {
+ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+ repeat_count = 1;
+ }
+
if ((argc == 2) &&
(sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
@@ -155,7 +163,9 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
path->pscontext = pi;
+ spin_lock_irqsave(&s->lock, flags);
list_add_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
@@ -164,16 +174,22 @@ static void st_fail_path(struct path_selector *ps, struct dm_path *path)
{
struct selector *s = ps->context;
struct path_info *pi = path->pscontext;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
list_move(&pi->list, &s->failed_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
}
static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
struct selector *s = ps->context;
struct path_info *pi = path->pscontext;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
list_move_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irqrestore(&s->lock, flags);
return 0;
}
@@ -255,14 +271,16 @@ static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
return pi2->relative_throughput - pi1->relative_throughput;
}
-static struct dm_path *st_select_path(struct path_selector *ps,
- unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes)
{
struct selector *s = ps->context;
struct path_info *pi = NULL, *best = NULL;
+ struct dm_path *ret = NULL;
+ unsigned long flags;
+ spin_lock_irqsave(&s->lock, flags);
if (list_empty(&s->valid_paths))
- return NULL;
+ goto out;
/* Change preferred (first in list) path to evenly balance. */
list_move_tail(s->valid_paths.next, &s->valid_paths);
@@ -272,11 +290,12 @@ static struct dm_path *st_select_path(struct path_selector *ps,
best = pi;
if (!best)
- return NULL;
-
- *repeat_count = best->repeat_count;
+ goto out;
- return best->path;
+ ret = best->path;
+out:
+ spin_unlock_irqrestore(&s->lock, flags);
+ return ret;
}
static int st_start_io(struct path_selector *ps, struct dm_path *path,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3766386080a4..70bb0e8b62ce 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1105,6 +1105,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int i;
int r = -EINVAL;
char *origin_path, *cow_path;
+ dev_t origin_dev, cow_dev;
unsigned args_used, num_flush_bios = 1;
fmode_t origin_mode = FMODE_READ;
@@ -1135,11 +1136,19 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot get origin device";
goto bad_origin;
}
+ origin_dev = s->origin->bdev->bd_dev;
cow_path = argv[0];
argv++;
argc--;
+ cow_dev = dm_get_dev_t(cow_path);
+ if (cow_dev && cow_dev == origin_dev) {
+ ti->error = "COW device cannot be the same as origin device";
+ r = -EINVAL;
+ goto bad_cow;
+ }
+
r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
if (r) {
ti->error = "Cannot get COW device";
@@ -1201,7 +1210,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = s;
ti->num_flush_bios = num_flush_bios;
- ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
+ ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 061152a43730..f9e8f0bef332 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -365,6 +365,26 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
}
/*
+ * Convert the path to a device
+ */
+dev_t dm_get_dev_t(const char *path)
+{
+ dev_t uninitialized_var(dev);
+ struct block_device *bdev;
+
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev))
+ dev = name_to_dev_t(path);
+ else {
+ dev = bdev->bd_dev;
+ bdput(bdev);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL_GPL(dm_get_dev_t);
+
+/*
* Add a device to the list, or just increment the usage count if
* it's already present.
*/
@@ -372,23 +392,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
struct dm_dev **result)
{
int r;
- dev_t uninitialized_var(dev);
+ dev_t dev;
struct dm_dev_internal *dd;
struct dm_table *t = ti->table;
- struct block_device *bdev;
BUG_ON(!t);
- /* convert the path to a device */
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev)) {
- dev = name_to_dev_t(path);
- if (!dev)
- return -ENODEV;
- } else {
- dev = bdev->bd_dev;
- bdput(bdev);
- }
+ dev = dm_get_dev_t(path);
+ if (!dev)
+ return -ENODEV;
dd = find_device(&t->devices, dev);
if (!dd) {
@@ -920,6 +932,30 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
return t->immutable_target_type;
}
+struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
+{
+ /* Immutable target is implicitly a singleton */
+ if (t->num_targets > 1 ||
+ !dm_target_is_immutable(t->targets[0].type))
+ return NULL;
+
+ return t->targets;
+}
+
+struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
+{
+ struct dm_target *uninitialized_var(ti);
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+ if (dm_target_is_wildcard(ti->type))
+ return ti;
+ }
+
+ return NULL;
+}
+
bool dm_table_request_based(struct dm_table *t)
{
return __table_type_request_based(dm_table_get_type(t));
@@ -933,7 +969,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
unsigned type = dm_table_get_type(t);
- unsigned per_bio_data_size = 0;
+ unsigned per_io_data_size = 0;
struct dm_target *tgt;
unsigned i;
@@ -945,10 +981,10 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
if (type == DM_TYPE_BIO_BASED)
for (i = 0; i < t->num_targets; i++) {
tgt = t->targets + i;
- per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+ per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
}
- t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
+ t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size);
if (!t->mempools)
return -ENOMEM;
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 925ec1b15e75..a317dd884ba6 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -150,7 +150,8 @@ static void io_err_release_clone_rq(struct request *clone)
static struct target_type error_target = {
.name = "error",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
+ .features = DM_TARGET_WILDCARD,
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index f962d6453afd..43824d73366d 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -344,7 +344,7 @@ static void subtree_dec(void *context, const void *value)
memcpy(&root_le, value, sizeof(root_le));
root = le64_to_cpu(root_le);
if (dm_btree_del(info, root))
- DMERR("btree delete failed\n");
+ DMERR("btree delete failed");
}
static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
@@ -1981,5 +1981,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
- dm_tm_issue_prefetches(pmd->tm);
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ dm_tm_issue_prefetches(pmd->tm);
+ up_read(&pmd->root_lock);
}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 72d91f477683..92237b6fa8cd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -235,6 +235,7 @@ struct pool {
struct pool_features pf;
bool low_water_triggered:1; /* A dm event has been sent */
bool suspended:1;
+ bool out_of_data_space:1;
struct dm_bio_prison *prison;
struct dm_kcopyd_client *copier;
@@ -461,9 +462,16 @@ static void cell_error_with_code(struct pool *pool,
dm_bio_prison_free_cell(pool->prison, cell);
}
+static int get_pool_io_error_code(struct pool *pool)
+{
+ return pool->out_of_data_space ? -ENOSPC : -EIO;
+}
+
static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
- cell_error_with_code(pool, cell, -EIO);
+ int error = get_pool_io_error_code(pool);
+
+ cell_error_with_code(pool, cell, error);
}
static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
@@ -622,7 +630,9 @@ static void error_retry_list_with_code(struct pool *pool, int error)
static void error_retry_list(struct pool *pool)
{
- return error_retry_list_with_code(pool, -EIO);
+ int error = get_pool_io_error_code(pool);
+
+ return error_retry_list_with_code(pool, error);
}
/*
@@ -2419,6 +2429,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
*/
if (old_mode != new_mode)
notify_of_pool_mode_change_to_oods(pool);
+ pool->out_of_data_space = true;
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
@@ -2432,6 +2443,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
case PM_WRITE:
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "write");
+ pool->out_of_data_space = false;
pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
@@ -2832,6 +2844,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_LIST_HEAD(&pool->active_thins);
pool->low_water_triggered = false;
pool->suspended = true;
+ pool->out_of_data_space = false;
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -3886,7 +3899,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 17, 0},
+ .version = {1, 18, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4037,7 +4050,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_bios = 1;
ti->flush_supported = true;
- ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
+ ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
ti->discard_zeroes_data_unsupported = true;
@@ -4260,7 +4273,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 17, 0},
+ .version = {1, 18, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 1cc10c4de701..459a9f8905ed 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -812,7 +812,7 @@ int verity_fec_ctr(struct dm_verity *v)
}
/* Reserve space for our per-bio data */
- ti->per_bio_data_size += sizeof(struct dm_verity_fec_io);
+ ti->per_io_data_size += sizeof(struct dm_verity_fec_io);
return 0;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 5c5d30cb6ec5..0aba34a7b3b3 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -354,7 +354,7 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
size_t len))
{
unsigned todo = 1 << v->data_dev_block_bits;
- struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
do {
int r;
@@ -460,7 +460,7 @@ static int verity_verify_io(struct dm_verity_io *io)
static void verity_finish_io(struct dm_verity_io *io, int error)
{
struct dm_verity *v = io->v;
- struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_error = error;
@@ -574,7 +574,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(bio) == WRITE)
return -EIO;
- io = dm_per_bio_data(bio, ti->per_bio_data_size);
+ io = dm_per_bio_data(bio, ti->per_io_data_size);
io->v = v;
io->orig_bi_end_io = bio->bi_end_io;
io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
@@ -1036,15 +1036,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- ti->per_bio_data_size = sizeof(struct dm_verity_io) +
+ ti->per_io_data_size = sizeof(struct dm_verity_io) +
v->shash_descsize + v->digest_size * 2;
r = verity_fec_ctr(v);
if (r)
goto bad;
- ti->per_bio_data_size = roundup(ti->per_bio_data_size,
- __alignof__(struct dm_verity_io));
+ ti->per_io_data_size = roundup(ti->per_io_data_size,
+ __alignof__(struct dm_verity_io));
return 0;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dd834927bc66..be4905769a45 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info {
struct bio clone;
};
-union map_info *dm_get_rq_mapinfo(struct request *rq)
-{
- if (rq && rq->end_io_data)
- return &((struct dm_rq_target_io *)rq->end_io_data)->info;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
-
#define MINOR_ALLOCED ((void *)-1)
/*
@@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_SUSPENDED_INTERNALLY 7
/*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
- int undefined__;
-};
-
-/*
* Work processed by per-device workqueue.
*/
struct mapped_device {
struct srcu_struct io_barrier;
struct mutex suspend_lock;
- atomic_t holders;
- atomic_t open_count;
/*
- * The current mapping.
+ * The current mapping (struct dm_table *).
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
- struct dm_table __rcu *map;
+ void __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
@@ -158,10 +140,16 @@ struct mapped_device {
unsigned long flags;
struct request_queue *queue;
+ int numa_node_id;
+
unsigned type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
+ atomic_t holders;
+ atomic_t open_count;
+
+ struct dm_target *immutable_target;
struct target_type *immutable_target_type;
struct gendisk *disk;
@@ -175,8 +163,20 @@ struct mapped_device {
atomic_t pending[2];
wait_queue_head_t wait;
struct work_struct work;
- struct bio_list deferred;
spinlock_t deferred_lock;
+ struct bio_list deferred;
+
+ /*
+ * Event handling.
+ */
+ wait_queue_head_t eventq;
+ atomic_t event_nr;
+ atomic_t uevent_seq;
+ struct list_head uevent_list;
+ spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+ /* the number of internal suspends */
+ unsigned internal_suspend_count;
/*
* Processing queue (flush)
@@ -192,32 +192,21 @@ struct mapped_device {
struct bio_set *bs;
/*
- * Event handling.
- */
- atomic_t event_nr;
- wait_queue_head_t eventq;
- atomic_t uevent_seq;
- struct list_head uevent_list;
- spinlock_t uevent_lock; /* Protect access to uevent_list */
-
- /*
* freeze/thaw support require holding onto a super block
*/
struct super_block *frozen_sb;
- struct block_device *bdev;
/* forced geometry settings */
struct hd_geometry geometry;
+ struct block_device *bdev;
+
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
- /* the number of internal suspends */
- unsigned internal_suspend_count;
-
struct dm_stats stats;
struct kthread_worker kworker;
@@ -230,8 +219,9 @@ struct mapped_device {
ktime_t last_rq_start_time;
/* for blk-mq request-based DM support */
- struct blk_mq_tag_set tag_set;
- bool use_blk_mq;
+ struct blk_mq_tag_set *tag_set;
+ bool use_blk_mq:1;
+ bool init_tio_pdu:1;
};
#ifdef CONFIG_DM_MQ_DEFAULT
@@ -240,10 +230,19 @@ static bool use_blk_mq = true;
static bool use_blk_mq = false;
#endif
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+#define DM_NUMA_NODE NUMA_NO_NODE
+
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+static int dm_numa_node = DM_NUMA_NODE;
+
bool dm_use_blk_mq(struct mapped_device *md)
{
return md->use_blk_mq;
}
+EXPORT_SYMBOL_GPL(dm_use_blk_mq);
/*
* For mempools pre-allocation at the table loading time.
@@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
*/
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+static int __dm_get_module_param_int(int *module_param, int min, int max)
+{
+ int param = ACCESS_ONCE(*module_param);
+ int modified_param = 0;
+ bool modified = true;
+
+ if (param < min)
+ modified_param = min;
+ else if (param > max)
+ modified_param = max;
+ else
+ modified = false;
+
+ if (modified) {
+ (void)cmpxchg(module_param, param, modified_param);
+ param = modified_param;
+ }
+
+ return param;
+}
+
static unsigned __dm_get_module_param(unsigned *module_param,
unsigned def, unsigned max)
{
@@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void)
}
EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+ return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+ return __dm_get_module_param(&dm_mq_queue_depth,
+ DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
+static unsigned dm_get_numa_node(void)
+{
+ return __dm_get_module_param_int(&dm_numa_node,
+ DM_NUMA_NODE, num_online_nodes() - 1);
+}
+
static int __init local_init(void)
{
int r = -ENOMEM;
@@ -323,7 +360,7 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
- _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
__alignof__(struct request), 0, NULL);
if (!_rq_cache)
goto out_free_rq_tio_cache;
@@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-static int dm_get_live_table_for_ioctl(struct mapped_device *md,
- struct dm_target **tgt, struct block_device **bdev,
- fmode_t *mode, int *srcu_idx)
+static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
+ struct block_device **bdev,
+ fmode_t *mode)
{
+ struct dm_target *tgt;
struct dm_table *map;
- int r;
+ int srcu_idx, r;
retry:
r = -ENOTTY;
- map = dm_get_live_table(md, srcu_idx);
+ map = dm_get_live_table(md, &srcu_idx);
if (!map || !dm_table_get_size(map))
goto out;
@@ -573,9 +611,8 @@ retry:
if (dm_table_get_num_targets(map) != 1)
goto out;
- *tgt = dm_table_get_target(map, 0);
-
- if (!(*tgt)->type->prepare_ioctl)
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->prepare_ioctl)
goto out;
if (dm_suspended_md(md)) {
@@ -583,14 +620,16 @@ retry:
goto out;
}
- r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+ r = tgt->type->prepare_ioctl(tgt, bdev, mode);
if (r < 0)
goto out;
+ bdgrab(*bdev);
+ dm_put_live_table(md, srcu_idx);
return r;
out:
- dm_put_live_table(md, *srcu_idx);
+ dm_put_live_table(md, srcu_idx);
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
msleep(10);
goto retry;
@@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
- struct dm_target *tgt;
- struct block_device *tgt_bdev = NULL;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
goto out;
}
- r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
+ r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
out:
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
bio_put(&tio->clone);
}
-static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
- gfp_t gfp_mask)
+static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
+ gfp_t gfp_mask)
{
return mempool_alloc(md->io_pool, gfp_mask);
}
-static void free_rq_tio(struct dm_rq_target_io *tio)
+static void free_old_rq_tio(struct dm_rq_target_io *tio)
{
mempool_free(tio, tio->md->io_pool);
}
-static struct request *alloc_clone_request(struct mapped_device *md,
- gfp_t gfp_mask)
+static struct request *alloc_old_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
{
return mempool_alloc(md->rq_pool, gfp_mask);
}
-static void free_clone_request(struct mapped_device *md, struct request *rq)
+static void free_old_clone_request(struct mapped_device *md, struct request *rq)
{
mempool_free(rq, md->rq_pool);
}
@@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
mutex_lock(&md->table_devices_lock);
td = find_table_device(&md->table_devices, dev, mode);
if (!td) {
- td = kmalloc(sizeof(*td), GFP_KERNEL);
+ td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
if (!td) {
mutex_unlock(&md->table_devices_lock);
return -ENOMEM;
@@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
* back into ->request_fn() could deadlock attempting to grab the
* queue lock again.
*/
- if (run_queue) {
- if (md->queue->mq_ops)
- blk_mq_run_hw_queues(md->queue, true);
- else
- blk_run_queue_async(md->queue);
- }
+ if (!md->queue->mq_ops && run_queue)
+ blk_run_queue_async(md->queue);
/*
* dm_put() must be at the end of this function. See the comment above
@@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone)
tio->ti->type->release_clone_rq(clone);
else if (!md->queue->mq_ops)
/* request_fn queue stacked on request_fn queue(s) */
- free_clone_request(md, clone);
- /*
- * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
- * no need to call free_clone_request() because we leverage blk-mq by
- * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
- */
+ free_old_clone_request(md, clone);
if (!md->queue->mq_ops)
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
}
/*
@@ -1192,13 +1220,13 @@ static void dm_unprep_request(struct request *rq)
if (clone)
free_rq_clone(clone);
else if (!tio->md->queue->mq_ops)
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
}
/*
* Requeue the original request of a clone.
*/
-static void old_requeue_request(struct request *rq)
+static void dm_old_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1209,45 +1237,57 @@ static void old_requeue_request(struct request *rq)
spin_unlock_irqrestore(q->queue_lock, flags);
}
+static void dm_mq_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ unsigned long flags;
+
+ blk_mq_requeue_request(rq);
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (!blk_queue_stopped(q))
+ blk_mq_kick_requeue_list(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
static void dm_requeue_original_request(struct mapped_device *md,
struct request *rq)
{
int rw = rq_data_dir(rq);
+ rq_end_stats(md, rq);
dm_unprep_request(rq);
- rq_end_stats(md, rq);
if (!rq->q->mq_ops)
- old_requeue_request(rq);
- else {
- blk_mq_requeue_request(rq);
- blk_mq_kick_requeue_list(rq->q);
- }
+ dm_old_requeue_request(rq);
+ else
+ dm_mq_requeue_request(rq);
rq_completed(md, rw, false);
}
-static void old_stop_queue(struct request_queue *q)
+static void dm_old_stop_queue(struct request_queue *q)
{
unsigned long flags;
- if (blk_queue_stopped(q))
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (blk_queue_stopped(q)) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
return;
+ }
- spin_lock_irqsave(q->queue_lock, flags);
blk_stop_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void stop_queue(struct request_queue *q)
+static void dm_stop_queue(struct request_queue *q)
{
if (!q->mq_ops)
- old_stop_queue(q);
+ dm_old_stop_queue(q);
else
blk_mq_stop_hw_queues(q);
}
-static void old_start_queue(struct request_queue *q)
+static void dm_old_start_queue(struct request_queue *q)
{
unsigned long flags;
@@ -1257,12 +1297,14 @@ static void old_start_queue(struct request_queue *q)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void start_queue(struct request_queue *q)
+static void dm_start_queue(struct request_queue *q)
{
if (!q->mq_ops)
- old_start_queue(q);
- else
+ dm_old_start_queue(q);
+ else {
blk_mq_start_stopped_hw_queues(q, true);
+ blk_mq_kick_requeue_list(q);
+ }
}
static void dm_done(struct request *clone, int error, bool mapped)
@@ -1313,7 +1355,7 @@ static void dm_softirq_done(struct request *rq)
if (!rq->q->mq_ops) {
blk_end_request_all(rq, tio->error);
rq_completed(tio->md, rw, false);
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
} else {
blk_mq_end_request(rq, tio->error);
rq_completed(tio->md, rw, false);
@@ -1336,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error)
struct dm_rq_target_io *tio = tio_from_request(rq);
tio->error = error;
- blk_complete_request(rq);
+ if (!rq->q->mq_ops)
+ blk_complete_request(rq);
+ else
+ blk_mq_complete_request(rq, error);
}
/*
@@ -1352,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
}
/*
- * Called with the clone's queue lock held (for non-blk-mq)
+ * Called with the clone's queue lock held (in the case of .request_fn)
*/
static void end_clone_request(struct request *clone, int error)
{
@@ -1522,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
/*
* Creates a bio that consists of range of complete bvecs.
*/
-static void clone_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned len)
+static int clone_bio(struct dm_target_io *tio, struct bio *bio,
+ sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
__bio_clone_fast(clone, bio);
- if (bio_integrity(bio))
- bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (bio_integrity(bio)) {
+ int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (r < 0)
+ return r;
+ }
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
if (bio_integrity(bio))
bio_integrity_trim(clone, 0, len);
+
+ return 0;
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@ -1593,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci)
return 0;
}
-static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
sector_t sector, unsigned *len)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
unsigned target_bio_nr;
unsigned num_target_bios = 1;
+ int r = 0;
/*
* Does the target want to receive duplicate copies of the bio?
@@ -1610,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
tio = alloc_tio(ci, ti, target_bio_nr);
tio->len_ptr = len;
- clone_bio(tio, bio, sector, *len);
+ r = clone_bio(tio, bio, sector, *len);
+ if (r < 0)
+ break;
__map_bio(tio);
}
+
+ return r;
}
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
@@ -1689,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
struct bio *bio = ci->bio;
struct dm_target *ti;
unsigned len;
+ int r;
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __send_discard(ci);
@@ -1701,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci)
len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
- __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+ r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+ if (r < 0)
+ return r;
ci->sector += len;
ci->sector_count -= len;
@@ -1839,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq,
return 0;
}
-static struct request *clone_rq(struct request *rq, struct mapped_device *md,
- struct dm_rq_target_io *tio, gfp_t gfp_mask)
+static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
/*
- * Do not allocate a clone if tio->clone was already set
- * (see: dm_mq_queue_rq).
+ * Create clone for use with .request_fn request_queue
*/
- bool alloc_clone = !tio->clone;
struct request *clone;
- if (alloc_clone) {
- clone = alloc_clone_request(md, gfp_mask);
- if (!clone)
- return NULL;
- } else
- clone = tio->clone;
+ clone = alloc_old_clone_request(md, gfp_mask);
+ if (!clone)
+ return NULL;
blk_rq_init(NULL, clone);
if (setup_clone(clone, rq, tio, gfp_mask)) {
/* -ENOMEM */
- if (alloc_clone)
- free_clone_request(md, clone);
+ free_old_clone_request(md, clone);
return NULL;
}
@@ -1877,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
tio->clone = NULL;
tio->orig = rq;
tio->error = 0;
- memset(&tio->info, 0, sizeof(tio->info));
+ /*
+ * Avoid initializing info for blk-mq; it passes
+ * target-specific data through info.ptr
+ * (see: dm_mq_init_request)
+ */
+ if (!md->init_tio_pdu)
+ memset(&tio->info, 0, sizeof(tio->info));
if (md->kworker_task)
init_kthread_work(&tio->work, map_tio_request);
}
-static struct dm_rq_target_io *prep_tio(struct request *rq,
- struct mapped_device *md, gfp_t gfp_mask)
+static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
+ struct mapped_device *md,
+ gfp_t gfp_mask)
{
struct dm_rq_target_io *tio;
int srcu_idx;
struct dm_table *table;
- tio = alloc_rq_tio(md, gfp_mask);
+ tio = alloc_old_rq_tio(md, gfp_mask);
if (!tio)
return NULL;
init_tio(tio, rq, md);
table = dm_get_live_table(md, &srcu_idx);
+ /*
+ * Must clone a request if this .request_fn DM device
+ * is stacked on .request_fn device(s).
+ */
if (!dm_table_mq_request_based(table)) {
- if (!clone_rq(rq, md, tio, gfp_mask)) {
+ if (!clone_old_rq(rq, md, tio, gfp_mask)) {
dm_put_live_table(md, srcu_idx);
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
return NULL;
}
}
@@ -1911,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
/*
* Called with the queue lock held.
*/
-static int dm_prep_fn(struct request_queue *q, struct request *rq)
+static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
struct dm_rq_target_io *tio;
@@ -1921,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
return BLKPREP_KILL;
}
- tio = prep_tio(rq, md, GFP_ATOMIC);
+ tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
if (!tio)
return BLKPREP_DEFER;
@@ -2079,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
static void dm_request_fn(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- struct dm_target *ti;
+ struct dm_target *ti = md->immutable_target;
struct request *rq;
struct dm_rq_target_io *tio;
- sector_t pos;
+ sector_t pos = 0;
+
+ if (unlikely(!ti)) {
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+ ti = dm_table_find_target(map, pos);
+ dm_put_live_table(md, srcu_idx);
+ }
/*
* For suspend, check blk_queue_stopped() and increment
@@ -2095,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q)
while (!blk_queue_stopped(q)) {
rq = blk_peek_request(q);
if (!rq)
- goto out;
+ return;
/* always use block 0 to find the target for flushes for now */
pos = 0;
if (!(rq->cmd_flags & REQ_FLUSH))
pos = blk_rq_pos(rq);
- ti = dm_table_find_target(map, pos);
- if (!dm_target_is_valid(ti)) {
- /*
- * Must perform setup, that rq_completed() requires,
- * before calling dm_kill_unmapped_request
- */
- DMERR_LIMIT("request attempted access beyond the end of device");
- dm_start_request(md, rq);
- dm_kill_unmapped_request(rq, -EIO);
- continue;
+ if ((dm_request_peeked_before_merge_deadline(md) &&
+ md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+ md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+ (ti->type->busy && ti->type->busy(ti))) {
+ blk_delay_queue(q, HZ / 100);
+ return;
}
- if (dm_request_peeked_before_merge_deadline(md) &&
- md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
- md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
- goto delay_and_out;
-
- if (ti->type->busy && ti->type->busy(ti))
- goto delay_and_out;
-
dm_start_request(md, rq);
tio = tio_from_request(rq);
@@ -2130,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q)
queue_kthread_work(&md->kworker, &tio->work);
BUG_ON(!irqs_disabled());
}
-
- goto out;
-
-delay_and_out:
- blk_delay_queue(q, HZ / 100);
-out:
- dm_put_live_table(md, srcu_idx);
}
static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -2146,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
struct dm_table *map;
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
- map = dm_get_live_table_fast(md);
- if (map) {
+ if (dm_request_based(md)) {
/*
- * Request-based dm cares about only own queue for
- * the query about congestion status of request_queue
+ * With request-based DM we only need to check the
+ * top-level queue for congestion.
*/
- if (dm_request_based(md))
- r = md->queue->backing_dev_info.wb.state &
- bdi_bits;
- else
+ r = md->queue->backing_dev_info.wb.state & bdi_bits;
+ } else {
+ map = dm_get_live_table_fast(md);
+ if (map)
r = dm_table_any_congested(map, bdi_bits);
+ dm_put_live_table_fast(md);
}
- dm_put_live_table_fast(md);
}
return r;
@@ -2238,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md)
md->queue->backing_dev_info.congested_data = md;
}
-static void dm_init_old_md_queue(struct mapped_device *md)
+static void dm_init_normal_md_queue(struct mapped_device *md)
{
md->use_blk_mq = false;
dm_init_md_queue(md);
@@ -2285,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md)
*/
static struct mapped_device *alloc_dev(int minor)
{
- int r;
- struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
+ int r, numa_node_id = dm_get_numa_node();
+ struct mapped_device *md;
void *old_md;
+ md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
@@ -2309,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor)
if (r < 0)
goto bad_io_barrier;
+ md->numa_node_id = numa_node_id;
md->use_blk_mq = use_blk_mq;
+ md->init_tio_pdu = false;
md->type = DM_TYPE_NONE;
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
@@ -2323,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue(GFP_KERNEL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
dm_init_md_queue(md);
- md->disk = alloc_disk(1);
+ md->disk = alloc_disk_node(1, numa_node_id);
if (!md->disk)
goto bad;
@@ -2393,8 +2445,10 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
cleanup_mapped_device(md);
- if (md->use_blk_mq)
- blk_mq_free_tag_set(&md->tag_set);
+ if (md->tag_set) {
+ blk_mq_free_tag_set(md->tag_set);
+ kfree(md->tag_set);
+ }
free_table_devices(&md->table_devices);
dm_stats_cleanup(&md->stats);
@@ -2502,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* This must be done before setting the queue restrictions,
* because request-based dm may be run just after the setting.
*/
- if (dm_table_request_based(t))
- stop_queue(q);
+ if (dm_table_request_based(t)) {
+ dm_stop_queue(q);
+ /*
+ * Leverage the fact that request-based DM targets are
+ * immutable singletons and establish md->immutable_target
+ * - used to optimize both dm_request_fn and dm_mq_queue_rq
+ */
+ md->immutable_target = dm_table_get_immutable_target(t);
+ }
__bind_mempools(md, t);
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
- rcu_assign_pointer(md->map, t);
+ rcu_assign_pointer(md->map, (void *)t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
@@ -2574,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type)
unsigned dm_get_md_type(struct mapped_device *md)
{
- BUG_ON(!mutex_is_locked(&md->type_lock));
return md->type;
}
@@ -2594,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
-static void init_rq_based_worker_thread(struct mapped_device *md)
+static void dm_old_init_rq_based_worker_thread(struct mapped_device *md)
{
/* Initialize the request-based DM worker thread */
init_kthread_worker(&md->kworker);
@@ -2603,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md)
}
/*
- * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
+ * Fully initialize a .request_fn request-based queue.
*/
-static int dm_init_request_based_queue(struct mapped_device *md)
+static int dm_old_init_request_queue(struct mapped_device *md)
{
- struct request_queue *q = NULL;
-
/* Fully initialize the queue */
- q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
- if (!q)
+ if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL))
return -EINVAL;
/* disable dm_request_fn's merge heuristic by default */
md->seq_rq_merge_deadline_usecs = 0;
- md->queue = q;
- dm_init_old_md_queue(md);
+ dm_init_normal_md_queue(md);
blk_queue_softirq_done(md->queue, dm_softirq_done);
- blk_queue_prep_rq(md->queue, dm_prep_fn);
+ blk_queue_prep_rq(md->queue, dm_old_prep_fn);
- init_rq_based_worker_thread(md);
+ dm_old_init_rq_based_worker_thread(md);
elv_register_queue(md->queue);
@@ -2642,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq,
*/
tio->md = md;
+ if (md->init_tio_pdu) {
+ /* target-specific per-io data is immediately after the tio */
+ tio->info.ptr = tio + 1;
+ }
+
return 0;
}
@@ -2651,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *rq = bd->rq;
struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
struct mapped_device *md = tio->md;
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- struct dm_target *ti;
- sector_t pos;
+ struct dm_target *ti = md->immutable_target;
- /* always use block 0 to find the target for flushes for now */
- pos = 0;
- if (!(rq->cmd_flags & REQ_FLUSH))
- pos = blk_rq_pos(rq);
+ if (unlikely(!ti)) {
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- ti = dm_table_find_target(map, pos);
- if (!dm_target_is_valid(ti)) {
+ ti = dm_table_find_target(map, 0);
dm_put_live_table(md, srcu_idx);
- DMERR_LIMIT("request attempted access beyond the end of device");
- /*
- * Must perform setup, that rq_completed() requires,
- * before returning BLK_MQ_RQ_QUEUE_ERROR
- */
- dm_start_request(md, rq);
- return BLK_MQ_RQ_QUEUE_ERROR;
}
- dm_put_live_table(md, srcu_idx);
if (ti->type->busy && ti->type->busy(ti))
return BLK_MQ_RQ_QUEUE_BUSY;
@@ -2688,20 +2736,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
*/
tio->ti = ti;
- /* Clone the request if underlying devices aren't blk-mq */
- if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
- /* clone request is allocated at the end of the pdu */
- tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
- (void) clone_rq(rq, md, tio, GFP_ATOMIC);
- queue_kthread_work(&md->kworker, &tio->work);
- } else {
- /* Direct call is fine since .queue_rq allows allocations */
- if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
- /* Undo dm_start_request() before requeuing */
- rq_end_stats(md, rq);
- rq_completed(md, rq_data_dir(rq), false);
- return BLK_MQ_RQ_QUEUE_BUSY;
- }
+ /* Direct call is fine since .queue_rq allows allocations */
+ if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+ /* Undo dm_start_request() before requeuing */
+ rq_end_stats(md, rq);
+ rq_completed(md, rq_data_dir(rq), false);
+ return BLK_MQ_RQ_QUEUE_BUSY;
}
return BLK_MQ_RQ_QUEUE_OK;
@@ -2714,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = {
.init_request = dm_mq_init_request,
};
-static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+static int dm_mq_init_request_queue(struct mapped_device *md,
+ struct dm_target *immutable_tgt)
{
- unsigned md_type = dm_get_md_type(md);
struct request_queue *q;
int err;
- memset(&md->tag_set, 0, sizeof(md->tag_set));
- md->tag_set.ops = &dm_mq_ops;
- md->tag_set.queue_depth = BLKDEV_MAX_RQ;
- md->tag_set.numa_node = NUMA_NO_NODE;
- md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- md->tag_set.nr_hw_queues = 1;
- if (md_type == DM_TYPE_REQUEST_BASED) {
- /* make the memory for non-blk-mq clone part of the pdu */
- md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
- } else
- md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
- md->tag_set.driver_data = md;
-
- err = blk_mq_alloc_tag_set(&md->tag_set);
+ if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+ DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+ return -EINVAL;
+ }
+
+ md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+ if (!md->tag_set)
+ return -ENOMEM;
+
+ md->tag_set->ops = &dm_mq_ops;
+ md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+ md->tag_set->numa_node = md->numa_node_id;
+ md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+ md->tag_set->driver_data = md;
+
+ md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+ if (immutable_tgt && immutable_tgt->per_io_data_size) {
+ /* any target-specific per-io data is immediately after the tio */
+ md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+ md->init_tio_pdu = true;
+ }
+
+ err = blk_mq_alloc_tag_set(md->tag_set);
if (err)
- return err;
+ goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
}
- md->queue = q;
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
blk_mq_register_disk(md->disk);
- if (md_type == DM_TYPE_REQUEST_BASED)
- init_rq_based_worker_thread(md);
-
return 0;
out_tag_set:
- blk_mq_free_tag_set(&md->tag_set);
+ blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+ kfree(md->tag_set);
+
return err;
}
@@ -2769,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md)
/*
* Setup the DM device's queue based on md's type
*/
-int dm_setup_md_queue(struct mapped_device *md)
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
unsigned md_type = filter_md_type(dm_get_md_type(md), md);
switch (md_type) {
case DM_TYPE_REQUEST_BASED:
- r = dm_init_request_based_queue(md);
+ r = dm_old_init_request_queue(md);
if (r) {
- DMWARN("Cannot initialize queue for request-based mapped device");
+ DMERR("Cannot initialize queue for request-based mapped device");
return r;
}
break;
case DM_TYPE_MQ_REQUEST_BASED:
- r = dm_init_request_based_blk_mq_queue(md);
+ r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
if (r) {
- DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+ DMERR("Cannot initialize queue for request-based dm-mq mapped device");
return r;
}
break;
case DM_TYPE_BIO_BASED:
- dm_init_old_md_queue(md);
+ dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
/*
* DM handles splitting bios as needed. Free the bio_split bioset
@@ -3133,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* dm defers requests to md->wq from md->queue.
*/
if (dm_request_based(md)) {
- stop_queue(md->queue);
+ dm_stop_queue(md->queue);
if (md->kworker_task)
flush_kthread_worker(&md->kworker);
}
@@ -3157,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
dm_queue_flush(md);
if (dm_request_based(md))
- start_queue(md->queue);
+ dm_start_queue(md->queue);
unlock_fs(md);
dm_table_presuspend_undo_targets(map);
@@ -3236,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
* Request-based dm is queueing the deferred I/Os in its request_queue.
*/
if (dm_request_based(md))
- start_queue(md->queue);
+ dm_start_queue(md->queue);
unlock_fs(md);
@@ -3482,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti)
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
- unsigned integrity, unsigned per_bio_data_size)
+ unsigned integrity, unsigned per_io_data_size)
{
- struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+ struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
struct kmem_cache *cachep = NULL;
unsigned int pool_size = 0;
unsigned int front_pad;
@@ -3498,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
case DM_TYPE_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
- front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+ front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
break;
case DM_TYPE_REQUEST_BASED:
cachep = _rq_tio_cache;
@@ -3511,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
if (!pool_size)
pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
- /* per_bio_data_size is not used. See __bind_mempools(). */
- WARN_ON(per_bio_data_size != 0);
+ /* per_io_data_size is used for blk-mq pdu at queue allocation */
break;
default:
BUG();
@@ -3554,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
}
static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
- u32 flags)
+ u32 flags)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3572,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
- u32 flags)
+ u32 flags)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3595,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3603,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3617,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
- enum pr_type type, bool abort)
+ enum pr_type type, bool abort)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3640,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3648,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3662,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3701,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
+
+module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
+
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7edcf97dfa5a..13a758ec0f88 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -73,6 +73,8 @@ int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
+struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
+struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
@@ -84,7 +86,7 @@ void dm_set_md_type(struct mapped_device *md, unsigned type);
unsigned dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
-int dm_setup_md_queue(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
* To check the return value from dm_table_find_target().
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ec1c61c87d89..0830c9e86f0d 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -124,6 +124,8 @@ struct dm_dev {
char name[16];
};
+dev_t dm_get_dev_t(const char *path);
+
/*
* Constructors should call these functions to ensure destination devices
* are opened/closed correctly.
@@ -190,6 +192,13 @@ struct target_type {
#define dm_target_is_immutable(type) ((type)->features & DM_TARGET_IMMUTABLE)
/*
+ * Indicates that a target may replace any target; even immutable targets.
+ * .map, .map_rq, .clone_and_map_rq and .release_clone_rq are all defined.
+ */
+#define DM_TARGET_WILDCARD 0x00000008
+#define dm_target_is_wildcard(type) ((type)->features & DM_TARGET_WILDCARD)
+
+/*
* Some targets need to be sent the same WRITE bio severals times so
* that they can send copies of it to different devices. This function
* examines any supplied bio and returns the number of copies of it the
@@ -231,10 +240,10 @@ struct dm_target {
unsigned num_write_same_bios;
/*
- * The minimum number of extra bytes allocated in each bio for the
- * target to use. dm_per_bio_data returns the data location.
+ * The minimum number of extra bytes allocated in each io for the
+ * target to use.
*/
- unsigned per_bio_data_size;
+ unsigned per_io_data_size;
/*
* If defined, this function is called to find out how many