From 6cf2a73cb2bc422a03984b285a63632c27f8c4e4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 12:40:23 -0300 Subject: docs: device-mapper: move it to the admin-guide The DM support describes lots of aspects related to mapped disk partitions from the userspace PoV. Signed-off-by: Mauro Carvalho Chehab --- .../admin-guide/device-mapper/cache-policies.rst | 131 +++++++ Documentation/admin-guide/device-mapper/cache.rst | 337 ++++++++++++++++ Documentation/admin-guide/device-mapper/delay.rst | 31 ++ .../admin-guide/device-mapper/dm-crypt.rst | 173 +++++++++ .../admin-guide/device-mapper/dm-dust.txt | 272 +++++++++++++ .../admin-guide/device-mapper/dm-flakey.rst | 74 ++++ .../admin-guide/device-mapper/dm-init.rst | 125 ++++++ .../admin-guide/device-mapper/dm-integrity.rst | 259 +++++++++++++ Documentation/admin-guide/device-mapper/dm-io.rst | 75 ++++ Documentation/admin-guide/device-mapper/dm-log.rst | 57 +++ .../admin-guide/device-mapper/dm-queue-length.rst | 48 +++ .../admin-guide/device-mapper/dm-raid.rst | 419 ++++++++++++++++++++ .../admin-guide/device-mapper/dm-service-time.rst | 101 +++++ .../admin-guide/device-mapper/dm-uevent.rst | 110 ++++++ .../admin-guide/device-mapper/dm-zoned.rst | 146 +++++++ Documentation/admin-guide/device-mapper/era.rst | 116 ++++++ Documentation/admin-guide/device-mapper/index.rst | 42 ++ Documentation/admin-guide/device-mapper/kcopyd.rst | 47 +++ Documentation/admin-guide/device-mapper/linear.rst | 63 +++ .../admin-guide/device-mapper/log-writes.rst | 145 +++++++ .../admin-guide/device-mapper/persistent-data.rst | 88 +++++ .../admin-guide/device-mapper/snapshot.rst | 196 ++++++++++ .../admin-guide/device-mapper/statistics.rst | 225 +++++++++++ .../admin-guide/device-mapper/striped.rst | 61 +++ Documentation/admin-guide/device-mapper/switch.rst | 141 +++++++ .../device-mapper/thin-provisioning.rst | 427 +++++++++++++++++++++ .../admin-guide/device-mapper/unstriped.rst | 135 +++++++ 
Documentation/admin-guide/device-mapper/verity.rst | 229 +++++++++++ .../admin-guide/device-mapper/writecache.rst | 79 ++++ Documentation/admin-guide/device-mapper/zero.rst | 37 ++ Documentation/admin-guide/index.rst | 1 + Documentation/device-mapper/cache-policies.rst | 131 ------- Documentation/device-mapper/cache.rst | 337 ---------------- Documentation/device-mapper/delay.rst | 31 -- Documentation/device-mapper/dm-crypt.rst | 173 --------- Documentation/device-mapper/dm-dust.txt | 272 ------------- Documentation/device-mapper/dm-flakey.rst | 74 ---- Documentation/device-mapper/dm-init.rst | 125 ------ Documentation/device-mapper/dm-integrity.rst | 259 ------------- Documentation/device-mapper/dm-io.rst | 75 ---- Documentation/device-mapper/dm-log.rst | 57 --- Documentation/device-mapper/dm-queue-length.rst | 48 --- Documentation/device-mapper/dm-raid.rst | 419 -------------------- Documentation/device-mapper/dm-service-time.rst | 101 ----- Documentation/device-mapper/dm-uevent.rst | 110 ------ Documentation/device-mapper/dm-zoned.rst | 146 ------- Documentation/device-mapper/era.rst | 116 ------ Documentation/device-mapper/index.rst | 44 --- Documentation/device-mapper/kcopyd.rst | 47 --- Documentation/device-mapper/linear.rst | 63 --- Documentation/device-mapper/log-writes.rst | 145 ------- Documentation/device-mapper/persistent-data.rst | 88 ----- Documentation/device-mapper/snapshot.rst | 196 ---------- Documentation/device-mapper/statistics.rst | 225 ----------- Documentation/device-mapper/striped.rst | 61 --- Documentation/device-mapper/switch.rst | 141 ------- Documentation/device-mapper/thin-provisioning.rst | 427 --------------------- Documentation/device-mapper/unstriped.rst | 135 ------- Documentation/device-mapper/verity.rst | 229 ----------- Documentation/device-mapper/writecache.rst | 79 ---- Documentation/device-mapper/zero.rst | 37 -- MAINTAINERS | 2 +- drivers/md/Kconfig | 2 +- drivers/md/dm-init.c | 2 +- drivers/md/dm-raid.c | 2 +- 65 files 
changed, 4394 insertions(+), 4395 deletions(-) create mode 100644 Documentation/admin-guide/device-mapper/cache-policies.rst create mode 100644 Documentation/admin-guide/device-mapper/cache.rst create mode 100644 Documentation/admin-guide/device-mapper/delay.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-crypt.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-dust.txt create mode 100644 Documentation/admin-guide/device-mapper/dm-flakey.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-init.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-integrity.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-io.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-log.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-queue-length.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-raid.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-service-time.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-uevent.rst create mode 100644 Documentation/admin-guide/device-mapper/dm-zoned.rst create mode 100644 Documentation/admin-guide/device-mapper/era.rst create mode 100644 Documentation/admin-guide/device-mapper/index.rst create mode 100644 Documentation/admin-guide/device-mapper/kcopyd.rst create mode 100644 Documentation/admin-guide/device-mapper/linear.rst create mode 100644 Documentation/admin-guide/device-mapper/log-writes.rst create mode 100644 Documentation/admin-guide/device-mapper/persistent-data.rst create mode 100644 Documentation/admin-guide/device-mapper/snapshot.rst create mode 100644 Documentation/admin-guide/device-mapper/statistics.rst create mode 100644 Documentation/admin-guide/device-mapper/striped.rst create mode 100644 Documentation/admin-guide/device-mapper/switch.rst create mode 100644 Documentation/admin-guide/device-mapper/thin-provisioning.rst create mode 100644 
Documentation/admin-guide/device-mapper/unstriped.rst create mode 100644 Documentation/admin-guide/device-mapper/verity.rst create mode 100644 Documentation/admin-guide/device-mapper/writecache.rst create mode 100644 Documentation/admin-guide/device-mapper/zero.rst delete mode 100644 Documentation/device-mapper/cache-policies.rst delete mode 100644 Documentation/device-mapper/cache.rst delete mode 100644 Documentation/device-mapper/delay.rst delete mode 100644 Documentation/device-mapper/dm-crypt.rst delete mode 100644 Documentation/device-mapper/dm-dust.txt delete mode 100644 Documentation/device-mapper/dm-flakey.rst delete mode 100644 Documentation/device-mapper/dm-init.rst delete mode 100644 Documentation/device-mapper/dm-integrity.rst delete mode 100644 Documentation/device-mapper/dm-io.rst delete mode 100644 Documentation/device-mapper/dm-log.rst delete mode 100644 Documentation/device-mapper/dm-queue-length.rst delete mode 100644 Documentation/device-mapper/dm-raid.rst delete mode 100644 Documentation/device-mapper/dm-service-time.rst delete mode 100644 Documentation/device-mapper/dm-uevent.rst delete mode 100644 Documentation/device-mapper/dm-zoned.rst delete mode 100644 Documentation/device-mapper/era.rst delete mode 100644 Documentation/device-mapper/index.rst delete mode 100644 Documentation/device-mapper/kcopyd.rst delete mode 100644 Documentation/device-mapper/linear.rst delete mode 100644 Documentation/device-mapper/log-writes.rst delete mode 100644 Documentation/device-mapper/persistent-data.rst delete mode 100644 Documentation/device-mapper/snapshot.rst delete mode 100644 Documentation/device-mapper/statistics.rst delete mode 100644 Documentation/device-mapper/striped.rst delete mode 100644 Documentation/device-mapper/switch.rst delete mode 100644 Documentation/device-mapper/thin-provisioning.rst delete mode 100644 Documentation/device-mapper/unstriped.rst delete mode 100644 Documentation/device-mapper/verity.rst delete mode 100644 
Documentation/device-mapper/writecache.rst delete mode 100644 Documentation/device-mapper/zero.rst diff --git a/Documentation/admin-guide/device-mapper/cache-policies.rst b/Documentation/admin-guide/device-mapper/cache-policies.rst new file mode 100644 index 000000000000..b17fe352fc41 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/cache-policies.rst @@ -0,0 +1,131 @@ +============================= +Guidance for writing policies +============================= + +Try to keep transactionality out of it. The core is careful to +avoid asking about anything that is migrating. This is a pain, but +makes it easier to write the policies. + +Mappings are loaded into the policy at construction time. + +Every bio that is mapped by the target is referred to the policy. +The policy can return a simple HIT or MISS or issue a migration. + +Currently there's no way for the policy to issue background work, +e.g. to start writing back dirty blocks that are going to be evicted +soon. + +Because we map bios, rather than requests it's easy for the policy +to get fooled by many small bios. For this reason the core target +issues periodic ticks to the policy. It's suggested that the policy +doesn't update states (eg, hit counts) for a block more than once +for each tick. The core ticks by watching bios complete, and so +trying to see when the io scheduler has let the ios run. + + +Overview of supplied cache replacement policies +=============================================== + +multiqueue (mq) +--------------- + +This policy is now an alias for smq (see below). + +The following tunables are accepted, but have no effect:: + + 'sequential_threshold <#nr_sequential_ios>' + 'random_threshold <#nr_random_ios>' + 'read_promote_adjustment ' + 'write_promote_adjustment ' + 'discard_promote_adjustment ' + +Stochastic multiqueue (smq) +--------------------------- + +This policy is the default. 
+ +The stochastic multi-queue (smq) policy addresses some of the problems +with the multiqueue (mq) policy. + +The smq policy (vs mq) offers the promise of less memory utilization, +improved performance and increased adaptability in the face of changing +workloads. smq also does not have any cumbersome tuning knobs. + +Users may switch from "mq" to "smq" simply by appropriately reloading a +DM table that is using the cache target. Doing so will cause all of the +mq policy's hints to be dropped. Also, performance of the cache may +degrade slightly until smq recalculates the origin device's hotspots +that should be cached. + +Memory usage +^^^^^^^^^^^^ + +The mq policy used a lot of memory; 88 bytes per cache block on a 64 +bit machine. + +smq uses 28bit indexes to implement its data structures rather than +pointers. It avoids storing an explicit hit count for each block. It +has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of +the entries (each hotspot block covers a larger area than a single +cache block). + +All this means smq uses ~25bytes per cache block. Still a lot of +memory, but a substantial improvement nonetheless. + +Level balancing +^^^^^^^^^^^^^^^ + +mq placed entries in different levels of the multiqueue structures +based on their hit count (~ln(hit count)). This meant the bottom +levels generally had the most entries, and the top ones had very +few. Having unbalanced levels like this reduced the efficacy of the +multiqueue. + +smq does not maintain a hit count, instead it swaps hit entries with +the least recently used entry from the level above. The overall +ordering being a side effect of this stochastic process. With this +scheme we can decide how many entries occupy each multiqueue level, +resulting in better promotion/demotion decisions. + +Adaptability: +The mq policy maintained a hit count for each cache block. For a +different block to get promoted to the cache its hit count has to +exceed the lowest currently in the cache.
This meant it could take a +long time for the cache to adapt between varying IO patterns. + +smq doesn't maintain hit counts, so a lot of this problem just goes +away. In addition it tracks performance of the hotspot queue, which +is used to decide which blocks to promote. If the hotspot queue is +performing badly then it starts moving entries more quickly between +levels. This lets it adapt to new IO patterns very quickly. + +Performance +^^^^^^^^^^^ + +Testing smq shows substantially better performance than mq. + +cleaner +------- + +The cleaner writes back all dirty blocks in a cache to decommission it. + +Examples +======== + +The syntax for a table is:: + + cache + <#feature_args> []* + <#policy_args> []* + +The syntax to send a message using the dmsetup command is:: + + dmsetup message 0 sequential_threshold 1024 + dmsetup message 0 random_threshold 8 + +Using dmsetup:: + + dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ + /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" + creates a 128GB large mapped device named 'blah' with the + sequential threshold set to 1024 and the random_threshold set to 8. diff --git a/Documentation/admin-guide/device-mapper/cache.rst b/Documentation/admin-guide/device-mapper/cache.rst new file mode 100644 index 000000000000..f15e5254d05b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/cache.rst @@ -0,0 +1,337 @@ +===== +Cache +===== + +Introduction +============ + +dm-cache is a device mapper target written by Joe Thornber, Heinz +Mauelshagen, and Mike Snitzer. + +It aims to improve performance of a block device (eg, a spindle) by +dynamically migrating some of its data to a faster, smaller device +(eg, an SSD). + +This device-mapper solution allows us to insert this caching at +different levels of the dm stack, for instance above the data device for +a thin-provisioning pool. 
Caching solutions that are integrated more +closely with the virtual memory system should give better performance. + +The target reuses the metadata library used in the thin-provisioning +library. + +The decision as to what data to migrate and when is left to a plug-in +policy module. Several of these have been written as we experiment, +and we hope other people will contribute others for specific io +scenarios (eg. a vm image server). + +Glossary +======== + + Migration + Movement of the primary copy of a logical block from one + device to the other. + Promotion + Migration from slow device to fast device. + Demotion + Migration from fast device to slow device. + +The origin device always contains a copy of the logical block, which +may be out of date or kept in sync with the copy on the cache device +(depending on policy). + +Design +====== + +Sub-devices +----------- + +The target is constructed by passing three devices to it (along with +other parameters detailed later): + +1. An origin device - the big, slow one. + +2. A cache device - the small, fast one. + +3. A small metadata device - records which blocks are in the cache, + which are dirty, and extra hints for use by the policy object. + This information could be put on the cache device, but having it + separate allows the volume manager to configure it differently, + e.g. as a mirror for extra robustness. This metadata device may only + be used by a single cache device. + +Fixed block size +---------------- + +The origin is divided up into blocks of a fixed size. This block size +is configurable when you first create the cache. Typically we've been +using block sizes of 256KB - 1024KB. The block size must be between 64 +sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). + +Having a fixed block size simplifies the target a lot. But it is +something of a compromise. For instance, a small part of a block may be +getting hit a lot, yet the whole block will be promoted to the cache. 
+So large block sizes are bad because they waste cache space. And small +block sizes are bad because they increase the amount of metadata (both +in core and on disk). + +Cache operating modes +--------------------- + +The cache has three operating modes: writeback, writethrough and +passthrough. + +If writeback, the default, is selected then a write to a block that is +cached will go only to the cache and the block will be marked dirty in +the metadata. + +If writethrough is selected then a write to a cached block will not +complete until it has hit both the origin and cache devices. Clean +blocks should remain clean. + +If passthrough is selected, useful when the cache contents are not known +to be coherent with the origin device, then all reads are served from +the origin device (all reads miss the cache) and all writes are +forwarded to the origin device; additionally, write hits cause cache +block invalidates. To enable passthrough mode the cache must be clean. +Passthrough mode allows a cache device to be activated without having to +worry about coherency. Coherency that exists is maintained, although +the cache will gradually cool as writes take place. If the coherency of +the cache can later be verified, or established through use of the +"invalidate_cblocks" message, the cache device can be transitioned to +writethrough or writeback mode while still warm. Otherwise, the cache +contents can be discarded prior to transitioning to the desired +operating mode. + +A simple cleaner policy is provided, which will clean (write back) all +dirty blocks in a cache. Useful for decommissioning a cache or when +shrinking a cache. Shrinking the cache's fast device requires all cache +blocks, in the area of the cache being removed, to be clean. If the +area being removed from the cache still contains dirty blocks the resize +will fail. Care must be taken to never reduce the volume used for the +cache's fast device until the cache is clean. 
This is of particular +importance if writeback mode is used. Writethrough and passthrough +modes already maintain a clean cache. Future support to partially clean +the cache, above a specified threshold, will allow for keeping the cache +warm and in writeback mode during resize. + +Migration throttling +-------------------- + +Migrating data between the origin and cache device uses bandwidth. +The user can set a throttle to prevent more than a certain amount of +migration occurring at any one time. Currently we're not taking any +account of normal io traffic going to the devices. More work needs +doing here to avoid migrating during those peak io moments. + +For the time being, a message "migration_threshold <#sectors>" +can be used to set the maximum number of sectors being migrated, +the default being 2048 sectors (1MB). + +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a FLUSH or FUA bio is written. +If no such requests are made then commits will occur every second. This +means the cache behaves like a physical disk that has a volatile write +cache. If power is lost you may lose some recent writes. The metadata +should always be consistent in spite of any crash. + +The 'dirty' state for a cache block changes far too frequently for us +to keep updating it on the fly. So we treat it as a hint. In normal +operation it will be written when the dm device is suspended. If the +system crashes all cache blocks will be assumed dirty when restarted. + +Per-block policy hints +---------------------- + +Policy plug-ins can store a chunk of data per cache block. It's up to +the policy how big this chunk is, but it should be kept small. Like the +dirty flags this data is lost if there's a crash so a safe fallback +value should always be possible. + +Policy hints affect performance, not correctness. 
+ +Policy messaging +---------------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. Refer to cache-policies.rst. + +Discard bitset resolution +------------------------- + +We can avoid copying data during migration if we know the block has +been discarded. A prime example of this is when mkfs discards the +whole block device. We store a bitset tracking the discard state of +blocks. However, we allow this bitset to have a different block size +from the cache blocks. This is because we need to track the discard +state for all of the origin device (compare with the dirty bitset +which is just for the smaller cache device). + +Target interface +================ + +Constructor +----------- + + :: + + cache + <#feature args> []* + <#policy args> [policy args]* + + ================ ======================================================= + metadata dev fast device holding the persistent metadata + cache dev fast device holding cached data blocks + origin dev slow device holding original data blocks + block size cache unit size in sectors + + #feature args number of feature arguments passed + feature args writethrough or passthrough (The default is writeback.) + + policy the replacement policy to use + #policy args an even number of arguments corresponding to + key/value pairs passed to the policy + policy args key/value pairs passed to the policy + E.g. 'sequential_threshold 1024' + See cache-policies.rst for details. + ================ ======================================================= + +Optional feature arguments are: + + + ==================== ======================================================== + writethrough write through caching that prohibits cache block + content from being different from origin block content.
+ Without this argument, the default behaviour is to write + back cache block contents later for performance reasons, + so they may differ from the corresponding origin blocks. + + passthrough a degraded mode useful for various cache coherency + situations (e.g., rolling back snapshots of + underlying storage). Reads and writes always go to + the origin. If a write goes to a cached origin + block, then the cache block is invalidated. + To enable passthrough mode the cache must be clean. + + metadata2 use version 2 of the metadata. This stores the dirty + bits in a separate btree, which improves speed of + shutting down the cache. + + no_discard_passdown disable passing down discards from the cache + to the origin's data device. + ==================== ======================================================== + +A policy called 'default' is always registered. This is an alias for +the policy we currently think is giving best all round performance. + +As the default policy could vary between kernels, if you are relying on +the characteristics of a specific policy, always request it by name. 
+ +Status +------ + +:: + + <#used metadata blocks>/<#total metadata blocks> + <#used cache blocks>/<#total cache blocks> + <#read hits> <#read misses> <#write hits> <#write misses> + <#demotions> <#promotions> <#dirty> <#features> * + <#core args> * <#policy args> * + + + +========================= ===================================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +cache block size Configurable block size for the cache device + in sectors +#used cache blocks Number of blocks resident in the cache +#total cache blocks Total number of cache blocks +#read hits Number of times a READ bio has been mapped + to the cache +#read misses Number of times a READ bio has been mapped + to the origin +#write hits Number of times a WRITE bio has been mapped + to the cache +#write misses Number of times a WRITE bio has been + mapped to the origin +#demotions Number of times a block has been removed + from the cache +#promotions Number of times a block has been moved to + the cache +#dirty Number of blocks in the cache that differ + from the origin +#feature args Number of feature args to follow +feature args 'writethrough' (optional) +#core args Number of core arguments (must be even) +core args Key/value pairs for tuning the core + e.g. migration_threshold +policy name Name of the policy +#policy args Number of policy arguments to follow (must be even) +policy args Key/value pairs e.g. sequential_threshold +cache metadata mode ro if read-only, rw if read-write + + In serious cases where even a read-only mode is + deemed unsafe no further I/O will be permitted and + the status will just contain the string 'Fail'. + The userspace recovery tools should then be used. 
+needs_check 'needs_check' if set, '-' if not set + A metadata operation has failed, resulting in the + needs_check flag being set in the metadata's + superblock. The metadata device must be + deactivated and checked/repaired before the + cache can be made fully operational again. + '-' indicates needs_check is not set. +========================= ===================================================== + +Messages +-------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. (A sysfs interface would also be possible.) + +The message format is:: + + + +E.g.:: + + dmsetup message my_cache 0 sequential_threshold 1024 + + +Invalidation is removing an entry from the cache without writing it +back. Cache blocks can be invalidated via the invalidate_cblocks +message, which takes an arbitrary number of cblock ranges. Each cblock +range's end value is "one past the end", meaning 5-10 expresses a range +of values from 5 to 9. Each cblock must be expressed as a decimal +value, in the future a variant message that takes cblock ranges +expressed in hexadecimal may be needed to better support efficient +invalidation of larger caches. 
The cache must be in passthrough mode +when invalidate_cblocks is used:: + + invalidate_cblocks [|-]* + +E.g.:: + + dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 + +Examples +======== + +The test suite can be found here: + +https://github.com/jthornber/device-mapper-test-suite + +:: + + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ + mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/admin-guide/device-mapper/delay.rst b/Documentation/admin-guide/device-mapper/delay.rst new file mode 100644 index 000000000000..917ba8c33359 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/delay.rst @@ -0,0 +1,31 @@ +======== +dm-delay +======== + +Device-Mapper's "delay" target delays reads and/or writes +and maps them to different devices. + +Parameters:: + + [ + [ ]] + +With separate write parameters, the first set is only used for reads. +Offsets are specified in sectors. +Delays are specified in milliseconds. 
+ +Example scripts +=============== + +:: + + #!/bin/sh + # Create device delaying rw operation for 500ms + echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed + +:: + + #!/bin/sh + # Create device delaying only write operation for 500ms and + # splitting reads and writes to different devices $1 $2 + echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst new file mode 100644 index 000000000000..8f4a3f889d43 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -0,0 +1,173 @@ +======== +dm-crypt +======== + +Device-Mapper's "crypt" target provides transparent encryption of block devices +using the kernel crypto API. + +For a more detailed description of supported parameters see: +https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt + +Parameters:: + + \ + [<#opt_params> ] + + + Encryption cipher, encryption mode and Initial Vector (IV) generator. + + The cipher specifications format is:: + + cipher[:keycount]-chainmode-ivmode[:ivopts] + + Examples:: + + aes-cbc-essiv:sha256 + aes-xts-plain64 + serpent-xts-plain64 + + Cipher format also supports direct specification with kernel crypt API + format (selected by capi: prefix). The IV specification is the same + as for the first format type. + This format is mainly used for specification of authenticated modes. + + The crypto API cipher specifications format is:: + + capi:cipher_api_spec-ivmode[:ivopts] + + Examples:: + + capi:cbc(aes)-essiv:sha256 + capi:xts(aes)-plain64 + + Examples of authenticated modes:: + + capi:gcm(aes)-random + capi:authenc(hmac(sha256),xts(aes))-random + capi:rfc7539(chacha20,poly1305)-random + + The /proc/crypto contains a list of currently loaded crypto modes. + + + Key used for encryption.
It is encoded either as a hexadecimal number + or it can be passed as prefixed with single colon + character (':') for keys residing in kernel keyring service. + You can only use key sizes that are valid for the selected cipher + in combination with the selected iv mode. + Note that for some iv modes the key string can contain additional + keys (for example IV seed) so the key contains more parts concatenated + into a single string. + + + The kernel keyring key is identified by string in following format: + ::. + + + The encryption key size in bytes. The kernel key payload size must match + the value passed in . + + + Either 'logon' or 'user' kernel key type. + + + The kernel keyring key description crypt target should look for + when loading key of . + + + Multi-key compatibility mode. You can define keys and + then sectors are encrypted according to their offsets (sector 0 uses key0; + sector 1 uses key1 etc.). must be a power of two. + + + The IV offset is a sector count that is added to the sector number + before creating the IV. + + + This is the device that is going to be used as backend and contains the + encrypted data. You can specify it as a path like /dev/xxx or a device + number :. + + + Starting sector within the device where the encrypted data begins. + +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional parameters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 3 allow_discards same_cpu_crypt submit_from_crypt_cpus + +allow_discards + Block discard requests (a.k.a. TRIM) are passed through the crypt device. + The default is to ignore discard requests. + + WARNING: Assess the specific security risks carefully before enabling this + option. For example, allowing discards on encrypted devices may lead to + the leak of information about the ciphertext device (filesystem type, + used space etc.)
if the discarded blocks can be located easily on the + device later. + +same_cpu_crypt + Perform encryption using the same cpu that IO was submitted on. + The default is to use an unbound workqueue so that encryption work + is automatically balanced between available CPUs. + +submit_from_crypt_cpus + Disable offloading writes to a separate thread after encryption. + There are some situations where offloading write bios from the + encryption threads to a single thread degrades performance + significantly. The default is to offload write bios to the same + thread because it benefits CFQ to have writes submitted using the + same context. + +integrity:: + The device requires additional metadata per-sector stored + in per-bio integrity structure. This metadata must be provided + by underlying dm-integrity target. + + The can be "none" if metadata is used only for persistent IV. + + For Authenticated Encryption with Additional Data (AEAD) + the is "aead". An AEAD mode additionally calculates and verifies + integrity for the encrypted device. The additional space is then + used for storing authentication tag (and persistent IV if needed). + +sector_size: + Use as the encryption unit instead of 512 bytes sectors. + This option can be in range 512 - 4096 bytes and must be power of two. + Virtual device will announce this size as a minimal IO and logical sector. + +iv_large_sectors + IV generators will use sector number counted in units + instead of default 512 bytes sectors. + + For example, if is 4096 bytes, plain64 IV for the second + sector will be 8 (without flag) and 1 if iv_large_sectors is present. + The must be multiple of (in 512 bytes units) + if this flag is specified.
+ +Example scripts +=============== +LUKS (Linux Unified Key Setup) is now the preferred way to set up disk +encryption with dm-crypt using the 'cryptsetup' utility, see +https://gitlab.com/cryptsetup/cryptsetup + +:: + + #!/bin/sh + # Create a crypt device using dmsetup + dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using dmsetup when encryption key is stored in keyring service + dmsetup create crypt2 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using cryptsetup and LUKS header with default cipher + cryptsetup luksFormat $1 + cryptsetup luksOpen $1 crypt1 diff --git a/Documentation/admin-guide/device-mapper/dm-dust.txt b/Documentation/admin-guide/device-mapper/dm-dust.txt new file mode 100644 index 000000000000..954d402a1f6a --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-dust.txt @@ -0,0 +1,272 @@ +dm-dust +======= + +This target emulates the behavior of bad sectors at arbitrary +locations, and the ability to enable the emulation of the failures +at an arbitrary time. + +This target behaves similarly to a linear target. At a given time, +the user can send a message to the target to start failing read +requests on specific blocks (to emulate the behavior of a hard disk +drive with bad sectors). + +When the failure behavior is enabled (i.e.: when the output of +"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks +in the "bad block list" will fail with EIO ("Input/output error"). + +Writes of blocks in the "bad block list" will result in the following: + +1. Remove the block from the "bad block list". +2. Successfully complete the write. + +This emulates the "remapped sector" behavior of a drive with bad +sectors.
+ +Normally, a drive that is encountering bad sectors will most likely +encounter more bad sectors, at an unknown time or location. +With dm-dust, the user can use the "addbadblock" and "removebadblock" +messages to add arbitrary bad blocks at new locations, and the +"enable" and "disable" messages to modulate the state of whether the +configured "bad blocks" will be treated as bad, or bypassed. +This allows the pre-writing of test data and metadata prior to +simulating a "failure" event where bad sectors start to appear. + +Table parameters: +----------------- +<device_path> <offset> <blksz> + +Mandatory parameters: + <device_path>: path to the block device. + <offset>: offset to data area from start of device_path + <blksz>: block size in bytes + (minimum 512, maximum 1073741824, must be a power of 2) + +Usage instructions: +------------------- + +First, find the size (in 512-byte sectors) of the device to be used: + +$ sudo blockdev --getsz /dev/vdb1 +33552384 + +Create the dm-dust device: +(For a device with a block size of 512 bytes) +$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512' + +(For a device with a block size of 4096 bytes) +$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' + +Check the status of the read behavior ("bypass" indicates that all I/O +will be passed through to the underlying device): +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 bypass + +$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct +128+0 records in +128+0 records out + +$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct +128+0 records in +128+0 records out + +Adding and removing bad blocks: +------------------------------- + +At any time (i.e.: whether the device has the "bad block" emulation +enabled or disabled), bad blocks may be added or removed from the +device via the "addbadblock" and "removebadblock" messages: + +$ sudo dmsetup message dust1 0 addbadblock 60 +kernel: device-mapper: dust: badblock added at block 60 + +$ sudo dmsetup 
message dust1 0 addbadblock 67 +kernel: device-mapper: dust: badblock added at block 67 + +$ sudo dmsetup message dust1 0 addbadblock 72 +kernel: device-mapper: dust: badblock added at block 72 + +These bad blocks will be stored in the "bad block list". +While the device is in "bypass" mode, reads and writes will succeed: + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 bypass + +Enabling block read failures: +----------------------------- + +To enable the "fail read on bad block" behavior, send the "enable" message: + +$ sudo dmsetup message dust1 0 enable +kernel: device-mapper: dust: enabling read failures on bad sectors + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block + +With the device in "fail read on bad block" mode, attempting to read a +block will encounter an "Input/output error": + +$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct +dd: error reading '/dev/mapper/dust1': Input/output error +0+0 records in +0+0 records out +0 bytes copied, 0.00040651 s, 0.0 kB/s + +...and writing to the bad blocks will remove the blocks from the list, +therefore emulating the "remap" behavior of hard disk drives: + +$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct +128+0 records in +128+0 records out + +kernel: device-mapper: dust: block 60 removed from badblocklist by write +kernel: device-mapper: dust: block 67 removed from badblocklist by write +kernel: device-mapper: dust: block 72 removed from badblocklist by write +kernel: device-mapper: dust: block 87 removed from badblocklist by write + +Bad block add/remove error handling: +------------------------------------ + +Attempting to add a bad block that already exists in the list will +result in an "Invalid argument" error, as well as a helpful message: + +$ sudo dmsetup message dust1 0 addbadblock 88 +device-mapper: message ioctl on dust1 failed: Invalid argument +kernel: device-mapper: dust: block 88 already in badblocklist + 
+Attempting to remove a bad block that doesn't exist in the list will +result in an "Invalid argument" error, as well as a helpful message: + +$ sudo dmsetup message dust1 0 removebadblock 87 +device-mapper: message ioctl on dust1 failed: Invalid argument +kernel: device-mapper: dust: block 87 not found in badblocklist + +Counting the number of bad blocks in the bad block list: +-------------------------------------------------------- + +To count the number of bad blocks configured in the device, run the +following message command: + +$ sudo dmsetup message dust1 0 countbadblocks + +A message will print with the number of bad blocks currently +configured on the device: + +kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found + +Querying for specific bad blocks: +--------------------------------- + +To find out if a specific block is in the bad block list, run the +following message command: + +$ sudo dmsetup message dust1 0 queryblock 72 + +The following message will print if the block is in the list: +device-mapper: dust: queryblock: block 72 found in badblocklist + +The following message will print if the block is not in the list: +device-mapper: dust: queryblock: block 72 not found in badblocklist + +The "queryblock" message command will work in both the "enabled" +and "disabled" modes, allowing the verification of whether a block +will be treated as "bad" without having to issue I/O to the device, +or having to "enable" the bad block emulation.
+ +Clearing the bad block list: +---------------------------- + +To clear the bad block list (without needing to individually run +a "removebadblock" message command for every block), run the +following message command: + +$ sudo dmsetup message dust1 0 clearbadblocks + +After clearing the bad block list, the following message will appear: + +kernel: device-mapper: dust: clearbadblocks: badblocks cleared + +If there were no bad blocks to clear, the following message will +appear: + +kernel: device-mapper: dust: clearbadblocks: no badblocks found + +Message commands list: +---------------------- + +Below is a list of the messages that can be sent to a dust device: + +Operations on blocks (requires a <blknum> argument): + +addbadblock <blknum> +queryblock <blknum> +removebadblock <blknum> + +...where <blknum> is a block number within range of the device + (corresponding to the block size of the device.) + +Single argument message commands: + +countbadblocks +clearbadblocks +disable +enable +quiet + +Device removal: +--------------- + +When finished, remove the device via the "dmsetup remove" command: + +$ sudo dmsetup remove dust1 + +Quiet mode: +----------- + +On test runs with many bad blocks, it may be desirable to avoid +excessive logging (from bad blocks added, removed, or "remapped"). +This can be done by enabling "quiet mode" via the following message: + +$ sudo dmsetup message dust1 0 quiet + +This will suppress log messages from add / remove / removed by write +operations. Log messages from "countbadblocks" or "queryblock" +message commands will still print in quiet mode. + +The status of quiet mode can be seen by running "dmsetup status": + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block quiet + +To disable quiet mode, send the "quiet" message again: + +$ sudo dmsetup message dust1 0 quiet + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block verbose + +(The presence of "verbose" indicates normal logging.) + +"Why not...?" 
+------------- + +scsi_debug has a "medium error" mode that can fail reads on one +specified sector (sector 0x1234, hardcoded in the source code), but +it uses RAM for the persistent storage, which drastically decreases +the potential device size. + +dm-flakey fails all I/O from all block locations at a specified time +frequency, and not a given point in time. + +When a bad sector occurs on a hard disk drive, reads to that sector +are failed by the device, usually resulting in an error code of EIO +("I/O error") or ENODATA ("No data available"). However, a write to +the sector may succeed, and result in the sector becoming readable +after the device controller no longer experiences errors reading the +sector (or after a reallocation of the sector). However, there may +be bad sectors that occur on the device in the future, in a different, +unpredictable location. + +This target seeks to provide a device that can exhibit the behavior +of a bad sector at a known sector location, at a known time, based +on a large storage device (at least tens of gigabytes, not occupying +system memory). diff --git a/Documentation/admin-guide/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst new file mode 100644 index 000000000000..86138735879d --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst @@ -0,0 +1,74 @@ +========= +dm-flakey +========= + +This target is the same as the linear target except that it exhibits +unreliable behaviour periodically. It's been found useful in simulating +failing devices for testing purposes. + +Starting from the time the table is loaded, the device is available for +<up interval> seconds, then exhibits unreliable behaviour for +<down interval> seconds, and then this cycle repeats. + +Also, consider using this in combination with the dm-delay target too, +which can delay reads and writes and/or send them to different +underlying devices. 
+ +Table parameters +---------------- + +:: + + <dev path> <offset> <up interval> <down interval> \ + [<num_features> [<feature arguments>]] + +Mandatory parameters: + + <dev path>: + Full pathname to the underlying block-device, or a + "major:minor" device-number. + <offset>: + Starting sector within the device. + <up interval>: + Number of seconds device is available. + <down interval>: + Number of seconds device returns errors. + +Optional feature parameters: + + If no feature parameters are present, during the periods of + unreliability, all I/O returns errors. + + drop_writes: + All write I/O is silently ignored. + Read I/O is handled correctly. + + error_writes: + All write I/O is failed with an error signalled. + Read I/O is handled correctly. + + corrupt_bio_byte <Nth_byte> <direction> <value> <flags>: + During <down interval>, replace <Nth_byte> of the data of + each matching bio with <value>. + + <Nth_byte>: + The offset of the byte to replace. + Counting starts at 1, to replace the first byte. + <direction>: + Either 'r' to corrupt reads or 'w' to corrupt writes. + 'w' is incompatible with drop_writes. + <value>: + The value (from 0-255) to write. + <flags>: + Perform the replacement only if bio->bi_opf has all the + selected flags set. + +Examples: + +Replaces the 32nd byte of READ bios with the value 1:: + + corrupt_bio_byte 32 r 1 0 + +Replaces the 224th byte of REQ_META (=32) bios with the value 0:: + + corrupt_bio_byte 224 w 0 32 diff --git a/Documentation/admin-guide/device-mapper/dm-init.rst b/Documentation/admin-guide/device-mapper/dm-init.rst new file mode 100644 index 000000000000..e5242ff17e9b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-init.rst @@ -0,0 +1,125 @@ +================================ +Early creation of mapped devices +================================ + +It is possible to configure a device-mapper device to act as the root device for +your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal userspace +which configures the device, then pivot_root(8) in to it. + +The second is to create one or more device-mappers using the module parameter +"dm-mod.create=" through the kernel boot command line argument. 
+ +The format is specified as a string of data separated by commas and optionally +semi-colons, where: + + - a comma is used to separate fields like name, uuid, flags and table + (specifies one device) + - a semi-colon is used to separate devices. + +So the format will look like this:: + + dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+] + +Where:: + + <name> ::= The device name. + <uuid> ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" + <minor> ::= The device minor number | "" + <flags> ::= "ro" | "rw" + <table> ::= <start_sector> <num_sectors> <target_type> <target_args> + <target_type> ::= "verity" | "linear" | ... (see list below) + +The dm line should be equivalent to the one used by the dmsetup tool with the +`--concise` argument. + +Target types +============ + +Not all target types are available as there are serious risks in allowing +activation of certain DM targets without first using userspace tools to check +the validity of associated metadata. + +======================= ======================================================= +`cache` constrained, userspace should verify cache device +`crypt` allowed +`delay` allowed +`era` constrained, userspace should verify metadata device +`flakey` constrained, meant for test +`linear` allowed +`log-writes` constrained, userspace should verify metadata device +`mirror` constrained, userspace should verify main/mirror device +`raid` constrained, userspace should verify metadata device +`snapshot` constrained, userspace should verify src/dst device +`snapshot-origin` allowed +`snapshot-merge` constrained, userspace should verify src/dst device +`striped` allowed +`switch` constrained, userspace should verify dev path +`thin` constrained, requires dm target message from userspace +`thin-pool` constrained, requires dm target message from userspace +`verity` allowed +`writecache` constrained, userspace should verify cache device +`zero` constrained, not meant for rootfs +======================= ======================================================= + +If the target is not listed above, it is constrained by default (not tested). + +Examples +======== +An example of booting to a linear array made up of user-mode linux block +devices:: + + dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 + +This will boot to a rw dm-linear target of 8192 sectors split across two block +devices identified by their major:minor numbers. After boot, udev will rename +this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. 
+ +An example of multiple device-mappers, with the dm-mod.create="..." contents +is shown here split on multiple lines for readability:: + + dm-linear,,1,rw, + 0 32768 linear 8:1 0, + 32768 1024000 linear 8:2 0; + dm-verity,,3,ro, + 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 + ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 + 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 + +Other examples (per target): + +"crypt":: + + dm-crypt,,8,ro, + 0 1048576 crypt aes-xts-plain64 + babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 + /dev/sda 0 1 allow_discards + +"delay":: + + dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 + +"linear":: + + dm-linear,,,rw, + 0 32768 linear /dev/sda1 0, + 32768 1024000 linear /dev/sda2 0, + 1056768 204800 linear /dev/sda3 0, + 1261568 512000 linear /dev/sda4 0 + +"snapshot-origin":: + + dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 + +"striped":: + + dm-striped,,4,ro,0 1638400 striped 4 4096 + /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 + +"verity":: + + dm-verity,,4,ro, + 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 + fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd + 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst new file mode 100644 index 000000000000..a30aa91b5fbe --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -0,0 +1,259 @@ +============ +dm-integrity +============ + +The dm-integrity target emulates a block device that has additional +per-sector tags that can be used for storing integrity information. + +A general problem with storing integrity tags with every sector is that +writing the sector and the integrity tag must be atomic - i.e. in case of +crash, either both sector and integrity tag or none of them is written. 
+ +To guarantee write atomicity, the dm-integrity target uses a journal, it +writes sector data and integrity tags into a journal, commits the journal +and then copies the data and integrity tags to their respective location. + +The dm-integrity target can be used with the dm-crypt target - in this +situation the dm-crypt target creates the integrity data and passes them +to the dm-integrity target via bio_integrity_payload attached to the bio. +In this mode, the dm-crypt and dm-integrity targets provide authenticated +disk encryption - if the attacker modifies the encrypted device, an I/O +error is returned instead of random data. + +The dm-integrity target can also be used as a standalone target, in this +mode it calculates and verifies the integrity tag internally. In this +mode, the dm-integrity target can be used to detect silent data +corruption on the disk or in the I/O path. + +There's an alternate mode of operation where dm-integrity uses bitmap +instead of a journal. If a bit in the bitmap is 1, the corresponding +region's data and integrity tags are not synchronized - if the machine +crashes, the unsynchronized regions will be recalculated. The bitmap mode +is faster than the journal mode, because we don't have to write the data +twice, but it is also less reliable, because if data corruption happens +when the machine crashes, it may not be detected. + +When loading the target for the first time, the kernel driver will format +the device. But it will only format the device if the superblock contains +zeroes. If the superblock is neither valid nor zeroed, the dm-integrity +target can't be loaded. + +To use the target for the first time: + +1. overwrite the superblock with zeroes +2. load the dm-integrity target with one-sector size, the kernel driver + will format the device +3. unload the dm-integrity target +4. read the "provided_data_sectors" value from the superblock +5. load the dm-integrity target with the target size + "provided_data_sectors" +6. 
if you want to use dm-integrity with dm-crypt, load the dm-crypt target + with the size "provided_data_sectors" + + +Target arguments: + +1. the underlying block device + +2. the number of reserved sectors at the beginning of the device - the + dm-integrity won't read or write these sectors + +3. the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + + D - direct writes (without journal) + in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag don't match. + J - journaled writes + data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode degrades write throughput twice because the + data have to be written twice. + B - bitmap mode - data and metadata are written without any + synchronization, the driver maintains a bitmap of dirty + regions where data and metadata don't match. This mode can + only be used with internal hash. + R - recovery mode - in this mode, journal is not replayed, + checksums are not checked and writes to the device are not + allowed. This mode is useful for data recovery if the + device cannot be activated in any of the other standard + modes. + +5. the number of additional arguments + +Additional arguments: + +journal_sectors:number + The size of journal, this argument is used only if formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave_sectors:number + The number of interleaved sectors. This value is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +meta_device:device + Don't interleave the data and metadata on the device. Use a + separate device for metadata. + +buffer_sectors:number + The number of sectors in one buffer. 
The value is rounded down to + a power of two. + + The tag area is accessed using buffers, the buffer size is + configurable. The large buffer size means that the I/O size will + be larger, but there could be fewer I/Os issued. + +journal_watermark:number + The journal watermark in percents. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit_time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediately if the FLUSH + request is received. + +internal_hash:algorithm(:key) (the key is optional) + Use internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32), then integrity target + will protect the data against accidental corruption. + You can also use a hmac algorithm (for example + "hmac(sha256):0123456789abcdef"), in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +recalculate + Recalculate the integrity tags automatically. It is only valid + when using internal hash. + +journal_crypt:algorithm(:key) (the key is optional) + Encrypt the journal using given algorithm to make sure that the + attacker can't read the journal. You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains history of last writes to the block device, + an attacker reading the journal could see the last sector numbers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. 
To protect against this + situation, you can encrypt the journal. + +journal_mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification. To protect against accidental modification, use a + crc algorithm, to protect against malicious modification, use a + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, modified sector number would be detected at + this stage. + +block_size:number + The size of a data block in bytes. The larger the block size the + less overhead there is for per-block integrity metadata. + Supported values are 512, 1024, 2048 and 4096 bytes. If not + specified the default block size is 512 bytes. + +sectors_per_bit:number + In the bitmap mode, this parameter specifies the number of + 512-byte sectors that corresponds to one bitmap bit. + +bitmap_flush_interval:number + The bitmap flush interval in milliseconds. The metadata buffers + are synchronized when this interval expires. + + +The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of disk data depend on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: + +* reserved sectors + (they are not used by this target, they can be used for + storing LUKS metadata or for other purpose), the size of the reserved + area is specified in the target arguments + +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. 
the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags + SB_FLAG_HAVE_JOURNAL_MAC + - a flag is set if journal_mac is used + SB_FLAG_RECALCULATING + - recalculating is in progress + SB_FLAG_DIRTY_BITMAP + - journal area contains the bitmap of dirty + blocks + * log2(sectors per block) + * a position where recalculating finished +* journal + The journal is divided into sections, each section contains: + + * metadata area (4kiB), it contains journal entries + + - every journal entry contains: + + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + + - every metadata sector ends with + + * mac (8-bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store hmac of sector + numbers in the journal section, to protect against a + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + + - every sector in the data area contains: + + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. + +* one or more runs of interleaved tags and data. + Each run contains: + + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. 
log2 of this value is stored + in the superblock. diff --git a/Documentation/admin-guide/device-mapper/dm-io.rst b/Documentation/admin-guide/device-mapper/dm-io.rst new file mode 100644 index 000000000000..d2492917a1f5 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-io.rst @@ -0,0 +1,75 @@ +===== +dm-io +===== + +Dm-io provides synchronous and asynchronous I/O services. There are three +types of I/O services available, and each type has a sync and an async +version. + +The user must set up an io_region structure to describe the desired location +of the I/O. Each io_region indicates a block-device along with the starting +sector and size of the region:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +Dm-io can read from one io_region or write to one or more io_regions. Writes +to multiple regions are specified by an array of io_region structures. + +The first I/O service type takes a list of memory pages as the data buffer for +the I/O, along with an offset into the first page:: + + struct page_list { + struct page_list *next; + struct page *page; + }; + + int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + unsigned long *error_bits); + int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + io_notify_fn fn, void *context); + +The second I/O service type takes an array of bio vectors as the data buffer +for the I/O. 
This service can be handy if the caller has a pre-assembled bio, +but wants to direct different portions of the bio to different devices:: + + int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + unsigned long *error_bits); + int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + io_notify_fn fn, void *context); + +The third I/O service type takes a pointer to a vmalloc'd memory buffer as the +data buffer for the I/O. This service can be handy if the caller needs to do +I/O to a large region but doesn't want to allocate a large number of individual +memory pages:: + + int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, unsigned long *error_bits); + int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, io_notify_fn fn, void *context); + +Callers of the asynchronous I/O services must include the name of a completion +callback routine and a pointer to some context data for the I/O:: + + typedef void (*io_notify_fn)(unsigned long error, void *context); + +The "error" parameter in this callback, as well as the `*error_bits` parameter in +all of the synchronous versions, is a bitset (instead of a simple error value). +In the case of a write-I/O to multiple regions, this bitset allows dm-io to +indicate success or failure on each individual region. + +Before using any of the dm-io services, the user should call dm_io_get() +and specify the number of pages they expect to perform I/O on concurrently. +Dm-io will attempt to resize its mempool to make sure enough pages are +always available in order to avoid unnecessary waiting while performing I/O. + +When the user is finished using the dm-io services, they should call +dm_io_put() and specify the same number of pages that were given on the +dm_io_get() call. 
diff --git a/Documentation/admin-guide/device-mapper/dm-log.rst b/Documentation/admin-guide/device-mapper/dm-log.rst new file mode 100644 index 000000000000..ba4fce39bc27 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-log.rst @@ -0,0 +1,57 @@ +===================== +Device-Mapper Logging +===================== +The device-mapper logging code is used by some of the device-mapper +RAID targets to track regions of the disk that are not consistent. +A region (or portion of the address space) of the disk may be +inconsistent because a RAID stripe is currently being operated on or +a machine died while the region was being altered. In the case of +mirrors, a region would be considered dirty/inconsistent while you +are writing to it because the writes need to be replicated for all +the legs of the mirror and may not reach the legs at the same time. +Once all writes are complete, the region is considered clean again. + +There is a generic logging interface that the device-mapper RAID +implementations use to perform logging operations (see +dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different +logging implementations are available and provide different +capabilities. The list includes: + +============== ============================================================== +Type Files +============== ============================================================== +disk drivers/md/dm-log.c +core drivers/md/dm-log.c +userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h +============== ============================================================== + +The "disk" log type +------------------- +This log implementation commits the log state to disk. This way, the +logging state survives reboots/crashes. + +The "core" log type +------------------- +This log implementation keeps the log state in memory. The log state +will not survive a reboot or crash, but there may be a small boost in +performance. 
This method can also be used if no storage device is +available for storing log state. + +The "userspace" log type +------------------------ +This log type simply provides a way to export the log API to userspace, +so log implementations can be done there. This is done by forwarding most +logging requests to userspace, where a daemon receives and processes the +request. + +The structures used for communication between kernel and userspace are +located in include/linux/dm-log-userspace.h. Due to the frequency, +diversity, and 2-way communication nature of the exchanges between +kernel and userspace, 'connector' is used as the interface for +communication. + +There are currently two userspace log implementations that leverage this +framework - "clustered-disk" and "clustered-core". These implementations +provide a cluster-coherent log for shared-storage. Device-mapper mirroring +can be used in a shared-storage environment when the cluster log implementations +are employed. diff --git a/Documentation/admin-guide/device-mapper/dm-queue-length.rst b/Documentation/admin-guide/device-mapper/dm-queue-length.rst new file mode 100644 index 000000000000..d8e381c1cb02 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-queue-length.rst @@ -0,0 +1,48 @@ +=============== +dm-queue-length +=============== + +dm-queue-length is a path selector module for device-mapper targets, +which selects a path with the least number of in-flight I/Os. +The path selector name is 'queue-length'. + +Table parameters for each path: [<repeat_count>] + +:: + + <repeat_count>: The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + +Status for each path: <status> <fail-count> <in-flight> + +:: + + <status>: 'A' if the path is active, 'F' if the path is failed. + <fail-count>: The number of path failures. + <in-flight>: The number of in-flight I/Os on the path.
+ + +Algorithm +========= + +dm-queue-length increments/decrements 'in-flight' when an I/O is +dispatched/completed respectively. +dm-queue-length selects a path with the minimum 'in-flight'. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128. + +:: + + # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" | \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst new file mode 100644 index 000000000000..2fe255b130fb --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -0,0 +1,419 @@ +======= +dm-raid +======= + +The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. +It allows the MD RAID drivers to be accessed using a device-mapper +interface. + + +Mapping Table Interface +----------------------- +The target is named "raid" and it accepts the following parameters:: + + <raid_type> <#raid_params> <raid_params> \ + <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>
] + +<raid_type>: + + ============= =============================================================== + raid0 RAID0 striping (no resilience) + raid1 RAID1 mirroring + raid4 RAID4 with dedicated last parity disk + raid5_n RAID5 with dedicated last parity disk supporting takeover + Same as raid4 + + - Transitory layout + raid5_la RAID5 left asymmetric + + - rotating parity 0 with data continuation + raid5_ra RAID5 right asymmetric + + - rotating parity N with data continuation + raid5_ls RAID5 left symmetric + + - rotating parity 0 with data restart + raid5_rs RAID5 right symmetric + + - rotating parity N with data restart + raid6_zr RAID6 zero restart + + - rotating parity zero (left-to-right) with data restart + raid6_nr RAID6 N restart + + - rotating parity N (right-to-left) with data restart + raid6_nc RAID6 N continue + + - rotating parity N (right-to-left) with data continuation + raid6_n_6 RAID6 with dedicated parity disks + + - parity and Q-syndrome on the last 2 disks; + layout for takeover from/to raid4/raid5_n + raid6_la_6 Same as "raid5_la" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_la from/to raid6 + raid6_ra_6 Same as "raid5_ra" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_ra from/to raid6 + raid6_ls_6 Same as "raid5_ls" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_ls from/to raid6 + raid6_rs_6 Same as "raid5_rs" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_rs from/to raid6 + raid10 Various RAID10 inspired algorithms chosen by additional params + (see raid10_format and raid10_copies below) + + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') + - RAID1E: Integrated Adjacent Stripe Mirroring + - RAID1E: Integrated Offset Stripe Mirroring + - and other similar RAID10 variants + ============= =============================================================== + + Reference: Chapter 4 of + http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
+<#raid_params>: The number of parameters that follow. + +<raid_params> consists of + + Mandatory parameters: + <chunk_size>: + Chunk size in sectors. This parameter is often known as + "stripe size". It is the only mandatory parameter and + is placed first. + + followed by optional parameters (in any order): + [sync|nosync] + Force or prevent RAID initialization. + + [rebuild <idx>] + Rebuild drive number 'idx' (first drive is 0). + + [daemon_sleep <ms>] + Interval between runs of the bitmap daemon that + clear bits. A longer interval means less bitmap I/O but + resyncing after a failure is likely to take longer. + + [min_recovery_rate <kB/sec/disk>] + Throttle RAID initialization + [max_recovery_rate <kB/sec/disk>] + Throttle RAID initialization + [write_mostly <idx>] + Mark drive index 'idx' write-mostly. + [max_write_behind <sectors>] + See '--write-behind=' (man mdadm) + [stripe_cache <sectors>] + Stripe cache size (RAID 4/5/6 only) + [region_size <sectors>] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. + + [raid10_copies <# copies>], [raid10_format <near|far|offset>] + These two options are used to alter the default layout of + a RAID10 configuration. The number of copies can be + specified, but the default is 2. There are also three + variations to how the copies are laid down - the default + is "near". Near copies are what most people think of with + respect to mirroring. If these options are left unspecified, + or 'raid10_copies 2' and/or 'raid10_format near' are given, + then the layouts for 2, 3 and 4 devices are: + + ======== ========== ============== + 2 drives 3 drives 4 drives + ======== ========== ============== + A1 A1 A1 A1 A2 A1 A1 A2 A2 + A2 A2 A2 A3 A3 A3 A3 A4 A4 + A3 A3 A4 A4 A5 A5 A5 A6 A6 + A4 A4 A5 A6 A6 A7 A7 A8 A8 + .. .. .. .. .. .. .. .. .. + ======== ========== ============== + + The 2-device layout is equivalent to 2-way RAID1. The 4-device + layout is what a traditional RAID10 would look like.
The + 3-device layout is what might be called a 'RAID1E - Integrated + Adjacent Stripe Mirroring'. + + If 'raid10_copies 2' and 'raid10_format far', then the layouts + for 2, 3 and 4 devices are: + + ======== ============ =================== + 2 drives 3 drives 4 drives + ======== ============ =================== + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + .. .. .. .. .. .. .. .. .. + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ============ =================== + + If 'raid10_copies 2' and 'raid10_format offset', then the + layouts for 2, 3 and 4 devices are: + + ======== ========== ================ + 2 drives 3 drives 4 drives + ======== ========== ================ + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ========== ================ + + Here we see layouts closely akin to 'RAID1E - Integrated + Offset Stripe Mirroring'. + + [delta_disks <N>] + The delta_disks option value (-251 < N < +251) triggers + device removal (negative value) or device addition (positive + value) to any reshape supporting raid levels 4/5/6 and 10. + RAID levels 4/5/6 allow for addition and removal of devices (metadata + and data device tuple), raid10_near and raid10_offset only + allow for device addition. raid10_far does not support any + reshaping at all. + A minimum number of devices has to be kept to enforce resilience, + which is 3 devices for raid4/5 and 4 devices for raid6. + + [data_offset <sectors>] + This option value defines the offset into each data device + where the data starts. This is used to provide out-of-place + reshaping space to avoid writing over data while + changing the layout of stripes, hence an interruption/crash + may happen at any time without the risk of losing data. + E.g.
when adding devices to an existing raid set during + forward reshaping, the out-of-place space will be allocated + at the beginning of each raid device. The kernel raid4/5/6/10 + MD personalities supporting such device addition will read the data from + the existing first stripes (those with smaller number of stripes) + starting at data_offset to fill up a new stripe with the larger + number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) + and write that new stripe to offset 0. Same will be applied to all + N-1 other new stripes. This out-of-place scheme is used to change + the RAID type (i.e. the allocation algorithm) as well, e.g. + changing from raid5_ls to raid5_n. + + [journal_dev <dev>] + This option adds a journal device to raid4/5/6 raid sets and + uses it to close the 'write hole' caused by the non-atomic updates + to the component devices which can cause data loss during recovery. + The journal device is used as writethrough thus causing writes to + be throttled versus non-journaled raid4/5/6 sets. + Takeover/reshape is not possible with a raid4/5/6 journal device; + it has to be deconfigured before requesting these. + + [journal_mode <mode>] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + +<#raid_devs>: The number of devices composing the array. + Each device consists of two entries. The first is the device + containing the metadata (if any); the second is the one containing the + data. A maximum of 64 metadata/data device entries are supported + up to target version 1.8.0. + 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime.
+ + If a drive has failed or is missing at creation time, a '-' can be + given for both the metadata and data drives for a given position. + + +Example Tables +-------------- + +:: + + # RAID4 - 4 data drives, 1 parity (no metadata devices) + # No metadata devices specified to hold superblock/bitmap info + # Chunk size of 1MiB + # (Lines separated for easy reading) + + 0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + + # RAID4 - 4 data drives, 1 parity (with metadata devices) + # Chunk size of 1MiB, force RAID initialization, + # min recovery rate at 20 kiB/sec/disk + + 0 1960893648 raid \ + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 + + +Status Output +------------- +'dmsetup table' displays the table used to construct the mapping. +The optional parameters are always printed in the order listed +above with "sync" or "nosync" always output ahead of the other +arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. + + +'dmsetup status' yields information on the state and health of the array. +The output is as follows (normally a single line, but expanded here for +clarity):: + + 1: <s> <l> raid \ + 2: <raid_type> <#devices> <health_chars> \ + 3: <sync_ratio> <sync_action> <mismatch_cnt> + +Line 1 is the standard output produced by device-mapper. + +Lines 2 & 3 are produced by the raid target and are best explained by example:: + + 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 + +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with its initial +recovery. Here is a fuller description of the individual fields: + + =============== ========================================================= + <raid_type> Same as the <raid_type> used to create the array. + <health_chars> One char for each device, indicating: + + - 'A' = alive and in-sync + - 'a' = alive but not in-sync + - 'D' = dead/failed.
+ <sync_ratio> The ratio indicating how much of the array has undergone + the process described by 'sync_action'. If the + 'sync_action' is "check" or "repair", then the process + of "resync" or "recover" can be considered complete. + <sync_action> One of the following possible states: + + idle + - No synchronization action is being performed. + frozen + - The current action has been halted. + resync + - Array is undergoing its initial synchronization + or is resynchronizing after an unclean shutdown + (possibly aided by a bitmap). + recover + - A device in the array is being rebuilt or + replaced. + check + - A user-initiated full check of the array is + being performed. All blocks are read and + checked for consistency. The number of + discrepancies found is recorded in + <mismatch_cnt>. No changes are made to the + array by this action. + repair + - The same as "check", but discrepancies are + corrected. + reshape + - The array is undergoing a reshape. + <mismatch_cnt> The number of discrepancies found between mirror copies + in RAID1/10 or wrong parity values found in RAID4/5/6. + This value is valid only after a "check" of the array + is performed. A healthy array has a 'mismatch_cnt' of 0. + <data_offset> The current data offset to the start of the user data on + each component device of a raid set (see the respective + raid parameter to support out-of-place reshaping). + <journal_char> - 'A' - active write-through journal device. + - 'a' - active write-back journal device. + - 'D' - dead journal device. + - '-' - no journal device. + =============== ========================================================= + + +Message Interface +----------------- +The dm-raid target will accept certain actions through the 'message' interface. +('man dmsetup' for more information on the message interface.) These actions +include: + + ========= ================================================ + "idle" Halt the current sync action. + "frozen" Freeze the current sync action. + "resync" Initiate/continue a resync.
+ "recover" Initiate/continue a recover process. + "check" Initiate a check (i.e. a "scrub") of the array. + "repair" Initiate a repair of the array. + ========= ================================================ + + +Discard Support +--------------- +The implementation of discard support among hardware vendors varies. +When a block is discarded, some storage devices will return zeroes when +the block is read. These devices set the 'discard_zeroes_data' +attribute. Other devices will return random data. Confusingly, some +devices that advertise 'discard_zeroes_data' will not reliably return +zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks +from a number of devices to calculate parity blocks and (for performance +reasons) relies on 'discard_zeroes_data' being reliable, it is important +that the devices be consistent. Blocks may be discarded in the middle +of a RAID 4/5/6 stripe and if subsequent read results are not +consistent, the parity blocks may be calculated differently at any time; +making the parity blocks useless for redundancy. It is important to +understand how your hardware behaves with discards if you are going to +enable discards with RAID 4/5/6. + +Since the behavior of storage devices is unreliable in this respect, +even when reporting 'discard_zeroes_data', by default RAID 4/5/6 +discard support is disabled -- this ensures data integrity at the +expense of losing some performance. + +Storage devices that properly support 'discard_zeroes_data' are +increasingly whitelisted in the kernel and can thus be trusted. + +For trusted devices, the following dm-raid module parameter can be set +to safely enable discard support for RAID 4/5/6: + + 'devices_handle_discard_safely' + + +Version History +--------------- + +:: + + 1.0.0 Initial version. Support for RAID 4/5/6 + 1.1.0 Added support for RAID 1 + 1.2.0 Handle creation of arrays that contain failed devices.
+ 1.3.0 Added support for RAID 10 + 1.3.1 Allow device replacement/rebuild for RAID 10 + 1.3.2 Fix/improve redundancy checking for RAID10 + 1.4.0 Non-functional change. Removes arg from mapping function. + 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). + 1.4.2 Add RAID10 "far" and "offset" algorithm support. + 1.5.0 Add message interface to allow manipulation of the sync_action. + New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. + 1.5.1 Add ability to restore transiently failed devices on resume. + 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". + 1.6.0 Add discard support (and devices_handle_discard_safely module param). + 1.7.0 Add support for MD RAID0 mappings. + 1.8.0 Explicitly check for compatible flags in the superblock metadata + and reject to start the raid set if any are set by a newer + target version, thus avoiding data corruption on a raid set + with a reshape in progress. + 1.9.0 Add support for RAID level takeover/reshape/region size + and set size reduction. + 1.9.1 Fix activation of existing RAID 4/10 mapped devices + 1.9.2 Don't emit '- -' on the status table line in case the constructor + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and + 'D' on the status line. If '- -' is passed into the constructor, emit + '- -' on the table line and '-' as the status line health character. + 1.10.0 Add support for raid4/5/6 journal device + 1.10.1 Fix data corruption on reshape request + 1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) + 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option + 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available + 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') + 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size and + state races.
+ 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen + 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape + deadlock/potential data corruption. Update superblock when + specific devices are requested via rebuild. Fix RAID leg + rebuild errors. diff --git a/Documentation/admin-guide/device-mapper/dm-service-time.rst b/Documentation/admin-guide/device-mapper/dm-service-time.rst new file mode 100644 index 000000000000..facf277fc13c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-service-time.rst @@ -0,0 +1,101 @@ +=============== +dm-service-time +=============== + +dm-service-time is a path selector module for device-mapper targets, +which selects a path with the shortest estimated service time for +the incoming I/O. + +The service time for each path is estimated by dividing the total size +of in-flight I/Os on a path by the performance value of the path. +The performance value is a relative throughput value among all paths +in a path-group, and it can be specified as a table argument. + +The path selector name is 'service-time'. + +Table parameters for each path: + + [<repeat_count> [<relative_throughput>]] + <repeat_count>: + The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + <relative_throughput>: + The relative throughput value of the path + among all paths in the path-group. + The valid range is 0-100. + If not given, minimum value '1' is used. + If '0' is given, the path isn't selected while + other paths having a positive value are available. + +Status for each path: + + <status> <fail-count> <in-flight-size> <relative_throughput> + <status>: + 'A' if the path is active, 'F' if the path is failed. + <fail-count>: + The number of path failures. + <in-flight-size>: + The size of in-flight I/Os on the path. + <relative_throughput>: + The relative throughput value of the path + among all paths in the path-group. + + +Algorithm +========= + +dm-service-time adds the I/O size to 'in-flight-size' when the I/O is +dispatched and subtracts when completed.
+Basically, dm-service-time selects a path having minimum service time +which is calculated by:: + + ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' + +However, some optimizations below are used to reduce the calculation +as much as possible. + + 1. If the paths have the same 'relative_throughput', skip + the division and just compare the 'in-flight-size'. + + 2. If the paths have the same 'in-flight-size', skip the division + and just compare the 'relative_throughput'. + + 3. If some paths have non-zero 'relative_throughput' and others + have zero 'relative_throughput', ignore those paths with zero + 'relative_throughput'. + +If such optimizations can't be applied, calculate service time, and +compare service time. +If calculated service time is equal, the path having maximum +'relative_throughput' may be better. So compare 'relative_throughput' +then. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128 +and sda has an average throughput 1GB/s and sdb has 4GB/s, +'relative_throughput' value may be '1' for sda and '4' for sdb:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" | \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 + + +Or '2' for sda and '8' for sdb would also be valid:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" | \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/admin-guide/device-mapper/dm-uevent.rst b/Documentation/admin-guide/device-mapper/dm-uevent.rst new file mode 100644 index 000000000000..4a8ee8d069c9 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-uevent.rst @@ -0,0 +1,110 @@
+==================== +device-mapper uevent +==================== + +The device-mapper uevent code adds the capability to device-mapper to create +and send kobject uevents (uevents). Previously device-mapper events were only +available through the ioctl interface. The advantage of the uevents interface +is the event contains environment attributes providing increased context for +the event avoiding the need to query the state of the device-mapper device after +the event is received. + +There are two functions currently for device-mapper events. The first function +listed creates the event and the second function sends the event(s):: + + void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) + + void dm_send_uevents(struct list_head *events, struct kobject *kobj) + + +The variables added to the uevent environment are: + +Variable Name: DM_TARGET +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Name of device-mapper target that generated the event. + +Variable Name: DM_ACTION +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Device-mapper specific action that caused the uevent action. + PATH_FAILED - A path has failed; + PATH_REINSTATED - A path has been reinstated. + +Variable Name: DM_SEQNUM +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: A sequence number for this specific device-mapper device. +:Value: Valid unsigned integer range. + +Variable Name: DM_PATH +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Major and minor number of the path device pertaining to this + event. +:Value: Path name in the form of "Major:Minor" + +Variable Name: DM_NR_VALID_PATHS +-------------------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: +:Value: Valid unsigned integer range. 
+ +Variable Name: DM_NAME +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Name of the device-mapper device. +:Value: Name + +Variable Name: DM_UUID +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: UUID of the device-mapper device. +:Value: UUID. (Empty string if there isn't one.) + +An example of the uevents generated as captured by udevmonitor is shown +below + +1.) Path failure:: + + UEVENT[1192521009.711215] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_FAILED + DM_SEQNUM=1 + DM_PATH=8:32 + DM_NR_VALID_PATHS=0 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1130 + +2.) Path reinstate:: + + UEVENT[1192521132.989927] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_REINSTATED + DM_SEQNUM=2 + DM_PATH=8:32 + DM_NR_VALID_PATHS=1 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1131 diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst new file mode 100644 index 000000000000..07f56ebc1730 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -0,0 +1,146 @@ +======== +dm-zoned +======== + +The dm-zoned device mapper target exposes a zoned block device (ZBC and +ZAC compliant devices) as a regular block device without any write +pattern constraints. In effect, it implements a drive-managed zoned +block device which hides from the user (a file system or an application +doing raw block device accesses) the sequential write constraints of +host-managed zoned block devices and can mitigate the potential +device-side performance degradation due to excessive random writes on +host-aware zoned block devices. 
+ +For a more detailed description of the zoned block device models and +their constraints see (for SCSI devices): + +http://www.t10.org/drafts.htm#ZBC_Family + +and (for ATA devices): + +http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf + +The dm-zoned implementation is simple and minimizes system overhead (CPU +and memory usage as well as storage capacity loss). For a 10TB +host-managed disk with 256 MB zones, dm-zoned memory usage per disk +instance is at most 4.5 MB and as little as 5 zones will be used +internally for storing metadata and performing reclaim operations. + +dm-zoned target devices are formatted and checked using the dmzadm +utility available at: + +https://github.com/hgst/dm-zoned-tools + +Algorithm +========= + +dm-zoned implements an on-disk buffering scheme to handle non-sequential +write accesses to the sequential zones of a zoned block device. +Conventional zones are used for caching as well as for storing internal +metadata. + +The zones of the device are separated into 2 types: + +1) Metadata zones: these are conventional zones used to store metadata. +Metadata zones are not reported as useable capacity to the user. + +2) Data zones: all remaining zones, the vast majority of which will be +sequential zones used exclusively to store user data. The conventional +zones of the device may be used also for buffering user random writes. +Data in these zones may be directly mapped to the conventional zone, but +later moved to a sequential zone so that the conventional zone can be +reused for buffering incoming random writes. + +dm-zoned exposes a logical device with a sector size of 4096 bytes, +irrespective of the physical sector size of the backend zoned block +device being used. This allows reducing the amount of metadata needed to +manage valid blocks (blocks written).
+ +The on-disk metadata format is as follows: + +1) The first block of the first conventional zone found contains the +super block which describes the on disk amount and position of metadata +blocks. + +2) Following the super block, a set of blocks is used to describe the +mapping of the logical device blocks. The mapping is done per chunk of +blocks, with the chunk size equal to the zoned block device zone size. The +mapping table is indexed by chunk number and each mapping entry +indicates the zone number of the device storing the chunk of data. Each +mapping entry may also indicate the zone number of a conventional +zone used to buffer random modifications to the data zone. + +3) A set of blocks used to store bitmaps indicating the validity of +blocks in the data zones follows the mapping table. A valid block is +defined as a block that was written and not discarded. For a buffered +data chunk, a block is always valid only in the data zone mapping the +chunk or in the buffer zone of the chunk. + +For a logical chunk mapped to a conventional zone, all write operations +are processed by directly writing to the zone. If the mapping zone is a +sequential zone, the write operation is processed directly only if the +write offset within the logical chunk is equal to the write pointer +offset within the sequential data zone (i.e. the write operation is +aligned on the zone write pointer). Otherwise, write operations are +processed indirectly using a buffer zone. In that case, an unused +conventional zone is allocated and assigned to the chunk being +accessed. Writing a block to the buffer zone of a chunk will +automatically invalidate the same block in the sequential zone mapping +the chunk. If all blocks of the sequential zone become invalid, the zone +is freed and the chunk buffer zone becomes the primary zone mapping the +chunk, resulting in native random write performance similar to a regular +block device.
+ +Read operations are processed according to the block validity +information provided by the bitmaps. Valid blocks are read either from +the sequential zone mapping a chunk, or if the chunk is buffered, from +the buffer zone assigned. If the accessed chunk has no mapping, or the +accessed blocks are invalid, the read buffer is zeroed and the read +operation terminated. + +After some time, the limited number of conventional zones available may +be exhausted (all used to map chunks or buffer sequential zones) and +unaligned writes to unbuffered chunks become impossible. To avoid this +situation, a reclaim process regularly scans used conventional zones and +tries to reclaim the least recently used zones by copying the valid +blocks of the buffer zone to a free sequential zone. Once the copy +completes, the chunk mapping is updated to point to the sequential zone +and the buffer zone freed for reuse. + +Metadata Protection +=================== + +To protect metadata against corruption in case of sudden power loss or +system crash, 2 sets of metadata zones are used. One set, the primary +set, is used as the main metadata region, while the secondary set is +used as a staging area. Modified metadata is first written to the +secondary set and validated by updating the super block in the secondary +set; a generation counter is used to indicate that this set contains the +newest metadata. Once this operation completes, in-place metadata +block updates can be done in the primary metadata set. This ensures that +one of the sets is always consistent (all modifications committed or none +at all). Flush operations are used as a commit point. Upon reception of +a flush request, metadata modification activity is temporarily blocked +(for both incoming BIO processing and reclaim process) and all dirty +metadata blocks are staged and updated. Normal operation is then +resumed. Flushing metadata thus only temporarily delays write and +discard requests.
Read requests can be processed concurrently while +metadata flush is being executed. + +Usage +===== + +A zoned block device must first be formatted using the dmzadm tool. This +will analyze the device zone configuration, determine where to place the +metadata sets on the device and initialize the metadata sets. + +Ex:: + + dmzadm --format /dev/sdxx + +For a formatted device, the target can be created normally with the +dmsetup utility. The only parameter that dm-zoned requires is the +underlying zoned block device name. Ex:: + + echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ + dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/admin-guide/device-mapper/era.rst b/Documentation/admin-guide/device-mapper/era.rst new file mode 100644 index 000000000000..90dd5c670b9f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/era.rst @@ -0,0 +1,116 @@ +====== +dm-era +====== + +Introduction +============ + +dm-era is a target that behaves similarly to the linear target. In +addition it keeps track of which blocks were written within a user +defined period of time called an 'era'. Each era target instance +maintains the current era as a monotonically increasing 32-bit +counter. + +Use cases include tracking changed blocks for backup software, and +partially invalidating the contents of a cache to restore cache +coherency after rolling back a vendor snapshot. + +Constructor +=========== + +era <metadata dev> <origin dev> <block size> + + ================ ====================================================== + metadata dev fast device holding the persistent metadata + origin dev device holding data blocks that may change + block size block size of origin data device, granularity that is + tracked by the target + ================ ====================================================== + +Messages +======== + +None of the dm messages take any arguments. + +checkpoint +---------- + +Possibly move to a new era. You shouldn't assume the era has +incremented. 
After sending this message, you should check the +current era via the status line. + +take_metadata_snap +------------------ + +Create a clone of the metadata, to allow a userland process to read it. + +drop_metadata_snap +------------------ + +Drop the metadata snapshot. + +Status +====== + +<metadata block size> <#used metadata blocks>/<#total metadata blocks> +<current era> <held metadata root | '-'> + +========================= ============================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +current era The current era +held metadata root The location, in blocks, of the metadata root + that has been 'held' for userspace read + access. '-' indicates there is no held root +========================= ============================================== + +Detailed use case +================= + +The scenario of invalidating a cache when rolling back a vendor +snapshot was the primary use case when developing this target: + +Taking a vendor snapshot +------------------------ + +- Send a checkpoint message to the era target +- Make a note of the current era in its status line +- Take vendor snapshot (the era and snapshot should be forever + associated now). + +Rolling back to a vendor snapshot +---------------------------------- + +- Cache enters passthrough mode (see: dm-cache's docs in cache.txt) +- Rollback vendor storage +- Take metadata snapshot +- Ascertain which blocks have been written since the snapshot was taken + by checking each block's era +- Invalidate those blocks in the caching software +- Cache returns to writeback/writethrough mode + +Memory usage +============ + +The target uses a bitset to record writes in the current era. It also +has a spare bitset ready for switching over to a new era. 
Other than +that it uses a few 4k blocks for updating metadata:: + + (4 * nr_blocks) bytes + buffers + +Resilience +========== + +Metadata is updated on disk before a write to a previously unwritten +block is performed. As such dm-era should not be affected by a hard +crash such as power failure. + +Userland tools +============== + +Userland tools are found in the increasingly poorly named +thin-provisioning-tools project: + + https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst new file mode 100644 index 000000000000..c77c58b8f67b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/index.rst @@ -0,0 +1,42 @@ +============= +Device Mapper +============= + +.. toctree:: + :maxdepth: 1 + + cache-policies + cache + delay + dm-crypt + dm-flakey + dm-init + dm-integrity + dm-io + dm-log + dm-queue-length + dm-raid + dm-service-time + dm-uevent + dm-zoned + era + kcopyd + linear + log-writes + persistent-data + snapshot + statistics + striped + switch + thin-provisioning + unstriped + verity + writecache + zero + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/device-mapper/kcopyd.rst b/Documentation/admin-guide/device-mapper/kcopyd.rst new file mode 100644 index 000000000000..7651d395127f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/kcopyd.rst @@ -0,0 +1,47 @@ +====== +kcopyd +====== + +Kcopyd provides the ability to copy a range of sectors from one block-device +to one or more other block-devices, with an asynchronous completion +notification. It is used by dm-snapshot and dm-mirror. + +Users of kcopyd must first create a client and indicate how many memory pages +to set aside for their copy jobs. 
This is done with a call to +kcopyd_client_create():: + + int kcopyd_client_create(unsigned int num_pages, + struct kcopyd_client **result); + +To start a copy job, the user must set up io_region structures to describe +the source and destinations of the copy. Each io_region indicates a +block-device along with the starting sector and size of the region. The source +of the copy is given as one io_region structure, and the destinations of the +copy are given as an array of io_region structures:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +To start the copy, the user calls kcopyd_copy(), passing in the client +pointer, pointers to the source and destination io_regions, the name of a +completion callback routine, and a pointer to some context data for the copy:: + + int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context); + + typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, + void *context); + +When the copy completes, kcopyd will call the user's completion routine, +passing back the user's context pointer. It will also indicate if a read or +write error occurred during the copy. + +When a user is done with all their copy jobs, they should call +kcopyd_client_destroy() to delete the kcopyd client, which will release the +associated memory pages:: + + void kcopyd_client_destroy(struct kcopyd_client *kc); diff --git a/Documentation/admin-guide/device-mapper/linear.rst b/Documentation/admin-guide/device-mapper/linear.rst new file mode 100644 index 000000000000..9d17fc6e64a9 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/linear.rst @@ -0,0 +1,63 @@ +========= +dm-linear +========= + +Device-Mapper's "linear" target maps a linear range of the Device-Mapper +device onto a linear range of another device. This is the basic building +block of logical volume managers. 
+ +Parameters: <dev path> <offset> + <dev path>: + Full pathname to the underlying block-device, or a + "major:minor" device-number. + <offset>: + Starting sector within the device. + + +Example scripts +=============== + +:: + + #!/bin/sh + # Create an identity mapping for a device + echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity + +:: + + #!/bin/sh + # Join 2 devices together + size1=`blockdev --getsz $1` + size2=`blockdev --getsz $2` + echo "0 $size1 linear $1 0 + $size1 $size2 linear $2 0" | dmsetup create joined + +:: + + #!/usr/bin/perl -w + # Split a device into 4M chunks and then join them together in reverse order. + + my $name = "reverse"; + my $extent_size = 4 * 1024 * 2; + my $dev = $ARGV[0]; + my $table = ""; + my $count = 0; + + if (!defined($dev)) { + die("Please specify a device.\n"); + } + + my $dev_size = `blockdev --getsz $dev`; + my $extents = int($dev_size / $extent_size) - + (($dev_size % $extent_size) ? 1 : 0); + + while ($extents > 0) { + my $this_start = $count * $extent_size; + $extents--; + $count++; + my $this_offset = $extents * $extent_size; + + $table .= "$this_start $extent_size linear $dev $this_offset\n"; + } + + `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/admin-guide/device-mapper/log-writes.rst b/Documentation/admin-guide/device-mapper/log-writes.rst new file mode 100644 index 000000000000..23141f2ffb7c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/log-writes.rst @@ -0,0 +1,145 @@ +============= +dm-log-writes +============= + +This target takes 2 devices, one to pass all IO to normally, and one to log all +of the write operations to. This is intended for file system developers wishing +to verify the integrity of metadata or data as the file system is written to. +There is a log_write_entry written for every WRITE request and the target is +able to take arbitrary data from userspace to insert into the log. 
The data +that is in the WRITE requests is copied into the log to make the replay happen +exactly as it happened originally. + +Log Ordering +============ + +We log things in order of completion once we are sure the write is no longer in +cache. This means that normal WRITE requests are not actually logged until the +next REQ_PREFLUSH request. This is to make it easier for userspace to replay +the log in a way that correlates to what is on disk and not what is in cache, +to make it easier to detect improper waiting/flushing. + +This works by attaching all WRITE requests to a list once the write completes. +Once we see a REQ_PREFLUSH request we splice this list onto the request and once +the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only +completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to +simulate the worst case scenario with regard to power failures. Consider the +following example (W means write, C means complete): + + W1,W2,W3,C3,C2,Wflush,C1,Cflush + +The log would show the following: + + W3,W2,flush,W1.... + +Again this is to simulate what is actually on disk, this allows us to detect +cases where a power failure at a particular point in time would create an +inconsistent file system. + +Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as +they complete as those requests will obviously bypass the device cache. + +Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would +have all the DISCARD requests, and then the WRITE requests and then the FLUSH +request. Consider the following example: + + WRITE block 1, DISCARD block 1, FLUSH + +If we logged DISCARD when it completed, the replay would look like this: + + DISCARD 1, WRITE 1, FLUSH + +which isn't quite what happened and wouldn't be caught during the log replay. 
+ +Target interface +================ + +i) Constructor + + log-writes + + ============= ============================================== + dev_path Device that all of the IO will go to normally. + log_dev_path Device where the log entries are written to. + ============= ============================================== + +ii) Status + + <#logged entries> + + =========================== ======================== + #logged entries Number of logged entries + highest allocated sector Highest allocated sector + =========================== ======================== + +iii) Messages + + mark + + You can use a dmsetup message to set an arbitrary mark in a log. + For example say you want to fsck a file system after every + write, but first you need to replay up to the mkfs to make sure + we're fsck'ing something reasonable, you would do something like + this:: + + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + + This would allow you to replay the log up to the mkfs mark and + then replay from that point on doing the fsck check in the + interval that you want. + + Every log has a mark at the end labeled "dm-log-writes-end". + +Userspace component +=================== + +There is a userspace tool that will replay the log for you in various ways. +It can be found here: https://github.com/josefbacik/log-writes + +Example usage +============= + +Say you want to test fsync on your file system. 
You would do something like +this:: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + dmsetup message log 0 mark fsync + md5sum /mnt/btrfs-test/foo + umount /mnt/btrfs-test + + dmsetup remove log + replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync + mount /dev/sdb /mnt/btrfs-test + md5sum /mnt/btrfs-test/foo + + + Another option is to do a complicated file system operation and verify the file + system is consistent during the entire operation. You could do this with: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + btrfs filesystem balance /mnt/btrfs-test + umount /mnt/btrfs-test + dmsetup remove log + + replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs + btrfsck /dev/sdb + replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ + --fsck "btrfsck /dev/sdb" --check fua + +And that will replay the log until it sees a FUA request, run the fsck command +and if the fsck passes it will replay to the next FUA, until it is completed or +the fsck command exits abnormally. diff --git a/Documentation/admin-guide/device-mapper/persistent-data.rst b/Documentation/admin-guide/device-mapper/persistent-data.rst new file mode 100644 index 000000000000..2065c3c5a091 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/persistent-data.rst @@ -0,0 +1,88 @@ +=============== +Persistent data +=============== + +Introduction +============ + +The more-sophisticated device-mapper targets require complex metadata +that is managed in kernel. 
In late 2010 we were seeing that various +different targets were rolling their own data structures, for example: + +- Mikulas Patocka's multisnap implementation +- Heinz Mauelshagen's thin provisioning target +- Another btree-based caching target posted to dm-devel +- Another multi-snapshot target based on a design of Daniel Phillips + +Maintaining these data structures takes a lot of work, so if possible +we'd like to reduce the number. + +The persistent-data library is an attempt to provide a re-usable +framework for people who want to store metadata in device-mapper +targets. It's currently used by the thin-provisioning target and an +upcoming hierarchical storage target. + +Overview +======== + +The main documentation is in the header files which can all be found +under drivers/md/persistent-data. + +The block manager +----------------- + +dm-block-manager.[hc] + +This provides access to the data on disk in fixed-size blocks. There +is a read/write locking interface to prevent concurrent accesses, and +keep data that is being used in the cache. + +Clients of persistent-data are unlikely to use this directly. + +The transaction manager +----------------------- + +dm-transaction-manager.[hc] + +This restricts access to blocks and enforces copy-on-write semantics. +The only way you can get hold of a writable block through the +transaction manager is by shadowing an existing block (ie. doing +copy-on-write) or allocating a fresh one. Shadowing is elided within +the same transaction so performance is reasonable. The commit method +ensures that all data is flushed before it writes the superblock. +On power failure your metadata will be as it was when last committed. + +The Space Maps +-------------- + +dm-space-map.h +dm-space-map-metadata.[hc] +dm-space-map-disk.[hc] + +On-disk data structures that keep track of reference counts of blocks. +Also acts as the allocator of new blocks. 
Currently two +implementations: a simpler one for managing blocks on a different +device (eg. thinly-provisioned data blocks); and one for managing +the metadata space. The latter is complicated by the need to store +its own data within the space it's managing. + +The data structures +------------------- + +dm-btree.[hc] +dm-btree-remove.c +dm-btree-spine.c +dm-btree-internal.h + +Currently there is only one data structure, a hierarchical btree. +There are plans to add more. For example, something with an +array-like interface would see a lot of use. + +The btree is 'hierarchical' in that you can define it to be composed +of nested btrees, and take multiple keys. For example, the +thin-provisioning target uses a btree with two levels of nesting. +The first maps a device id to a mapping tree, and that in turn maps a +virtual block to a physical block. + +Values stored in the btrees can have arbitrary size. Keys are always +64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/admin-guide/device-mapper/snapshot.rst b/Documentation/admin-guide/device-mapper/snapshot.rst new file mode 100644 index 000000000000..ccdd8b587a74 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/snapshot.rst @@ -0,0 +1,196 @@ +============================== +Device-mapper snapshot support +============================== + +Device-mapper allows you, without massive data copying: + +- To create snapshots of any block device i.e. mountable, saved states of + the block device which are also writable without interfering with the + original content; +- To create device "forks", i.e. multiple different versions of the + same data stream. +- To merge a snapshot of a block device back into the snapshot's origin + device. + +In the first two cases, dm copies only the chunks of data that get +changed and uses a separate copy-on-write (COW) block device for +storage. + +For snapshot merge the contents of the COW storage are merged back into +the origin device. 
+ + +There are three dm targets available: +snapshot, snapshot-origin, and snapshot-merge. + +- snapshot-origin <origin> + +which will normally have one or more snapshots based on it. +Reads will be mapped directly to the backing device. For each write, the +original data will be saved in the <COW device> of each snapshot to keep +its visible content unchanged, at least until the <COW device> fills up. + + +- snapshot <origin> <COW device> <persistent?> <chunksize> + [<# feature args> [<arg>]*] + +A snapshot of the <origin> block device is created. Changed chunks of +<chunksize> sectors will be stored on the <COW device>. Writes will +only go to the <COW device>. Reads will come from the <COW device> or +from <origin> for unchanged data. <COW device> will often be +smaller than the origin and if it fills up the snapshot will become +useless and be disabled, returning errors. So it is important to monitor +the amount of free space and expand the <COW device> before it fills up. + +<persistent?> is P (Persistent) or N (Not persistent - will not survive +after reboot). O (Overflow) can be added as a persistent store option +to allow userspace to advertise its support for seeing "Overflow" in the +snapshot status. So supported store types are "P", "PO" and "N". + +The difference between persistent and transient is with transient +snapshots less metadata must be saved on disk - they can be kept in +memory by the kernel. + +When loading or unloading the snapshot target, the corresponding +snapshot-origin or snapshot-merge target must be suspended. A failure to +suspend the origin target could result in data corruption. + +Optional features: + + discard_zeroes_cow - a discard issued to the snapshot device that + maps to entire chunks will zero the corresponding exception(s) in + the snapshot's exception store. + + discard_passdown_origin - a discard to the snapshot device is passed + down to the snapshot-origin's underlying device. This doesn't cause + copy-out to the snapshot exception store because the snapshot-origin + target is bypassed. + + The discard_passdown_origin feature depends on the discard_zeroes_cow + feature being enabled. 
+ + +- snapshot-merge <origin> <COW device> <persistent> <chunksize> + [<# feature args> [<arg>]*] + +takes the same table arguments as the snapshot target except it only +works with persistent snapshots. This target assumes the role of the +"snapshot-origin" target and must not be loaded if the "snapshot-origin" +is still present for <origin>. + +Creates a merging snapshot that takes control of the changed chunks +stored in the <COW device> of an existing snapshot, through a handover +procedure, and merges these chunks back into the <origin>. Once merging +has started (in the background) the <origin> may be opened and the merge +will continue while I/O is flowing to it. Changes to the <origin> are +deferred until the merging snapshot's corresponding chunk(s) have been +merged. Once merging has started the snapshot device, associated with +the "snapshot" target, will return -EIO when accessed. + + +How snapshot is used by LVM2 +============================ +When you create the first LVM2 snapshot of a volume, four dm devices are used: + +1) a device containing the original mapping table of the source volume; +2) a device used as the <COW device>; +3) a "snapshot" device, combining #1 and #2, which is the visible snapshot + volume; +4) the "original" volume (which uses the device number used by the original + source volume), whose table is replaced by a "snapshot-origin" mapping + from device #1. 
+ +A fixed naming scheme is used, so with the following commands:: + + lvcreate -L 1G -n base volumeGroup + lvcreate -L 100M --snapshot -n snap volumeGroup/base + +we'll have this situation (with volumes in above order):: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 + volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 + volumeGroup-base: 0 2097152 snapshot-origin 254:11 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow + brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap + brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + + +How snapshot-merge is used by LVM2 +================================== +A merging snapshot assumes the role of the "snapshot-origin" while +merging. As such the "snapshot-origin" is replaced with +"snapshot-merge". The "-real" device is not changed and the "-cow" +device is renamed to <origin name>-cow to aid LVM2's cleanup of the +merging snapshot after it completes. The "snapshot" that hands over its +COW device to the "snapshot-merge" is deactivated (unless using lvchange +--refresh); but if it is left active it will simply return I/O errors. 
+ +A snapshot will merge into its origin with the following command:: + + lvconvert --merge volumeGroup/snap + +we'll now have this situation:: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-base-cow: 0 204800 linear 8:19 2097536 + volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow + brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base + + +How to determine when a merging is complete +=========================================== +The snapshot-merge and snapshot status lines end with: + + <sectors_allocated>/<total_sectors> <metadata_sectors> + +Both <sectors_allocated> and <total_sectors> include both data and metadata. +During merging, the number of sectors allocated gets smaller and +smaller. Merging has finished when the number of sectors holding data +is zero, in other words <sectors_allocated> == <metadata_sectors>. + +Here is a practical example (using a hybrid of lvm and dmsetup commands):: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g + snap volumeGroup swi-a- 1.00g base 18.97 + + # dmsetup status volumeGroup-snap + 0 8388608 snapshot 397896/2097152 1560 + ^^^^ metadata sectors + + # lvconvert --merge -b volumeGroup/snap + Merging of volume snap started. + + # lvs volumeGroup/snap + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup Owi-a- 4.00g 17.23 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 281688/2097152 1104 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 180480/2097152 712 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 16/2097152 16 + +Merging has finished. 
+ +:: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst new file mode 100644 index 000000000000..3d80a9f850cc --- /dev/null +++ b/Documentation/admin-guide/device-mapper/statistics.rst @@ -0,0 +1,225 @@ +============= +DM statistics +============= + +Device Mapper supports the collection of I/O statistics on user-defined +regions of a DM device. If no regions are defined no statistics are +collected so there isn't any performance impact. Only bio-based DM +devices are currently supported. + +Each user-defined region specifies a starting sector, length and step. +Individual statistics will be collected for each step-sized area within +the range specified. + +The I/O statistics counters for each step-sized area of a region are +in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: +Documentation/iostats.txt). But two extra counters (12 and 13) are +provided: total time spent reading and writing. When the histogram +argument is used, the 14th parameter is reported that represents the +histogram of latencies. All these counters may be accessed by sending +the @stats_print message to the appropriate DM device via dmsetup. + +The reported times are in milliseconds and the granularity depends on +the kernel ticks. When the option precise_timestamps is used, the +reported times are in nanoseconds. + +Each region has a corresponding unique identifier, which we call a +region_id, that is assigned when the region is created. The region_id +must be supplied when querying statistics about the region, deleting the +region, etc. Unique region_ids enable multiple userspace programs to +request and process statistics for the same DM device without stepping +on each other's data. + +The creation of DM statistics will allocate memory via kmalloc or +fallback to using vmalloc space. 
At most, 1/4 of the overall system +memory may be allocated by DM statistics. The admin can see how much +memory is used by reading: + + /sys/module/dm_mod/parameters/stats_current_allocated_bytes + +Messages +======== + + @stats_create [ ...] [ []] + Create a new region and return the region_id. + + + "-" + whole device + "+" + a range of 512-byte sectors + starting with . + + + "" + the range is subdivided into areas each containing + sectors. + "/" + the range is subdivided into the specified + number of areas. + + + The number of optional arguments + + + The following optional arguments are supported: + + precise_timestamps + use precise timer with nanosecond resolution + instead of the "jiffies" variable. When this argument is + used, the resulting times are in nanoseconds instead of + milliseconds. Precise timestamps are a little bit slower + to obtain than jiffies-based timestamps. + histogram:n1,n2,n3,n4,... + collect histogram of latencies. The + numbers n1, n2, etc are times that represent the boundaries + of the histogram. If precise_timestamps is not used, the + times are in milliseconds, otherwise they are in + nanoseconds. For each range, the kernel will report the + number of requests that completed within this range. For + example, if we use "histogram:10,20,30", the kernel will + report four numbers a:b:c:d. a is the number of requests + that took 0-10 ms to complete, b is the number of requests + that took 10-20 ms to complete, c is the number of requests + that took 20-30 ms to complete and d is the number of + requests that took more than 30 ms to complete. + + + An optional parameter. A name that uniquely identifies + the userspace owner of the range. This groups ranges together + so that userspace programs can identify the ranges they + created and ignore those created by others. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use it for anything else. 
+ If we omit the number of optional arguments, program id must not + be a number, otherwise it would be interpreted as the number of + optional arguments. + + + An optional parameter. A word that provides auxiliary data + that is useful to the client program that created the range. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use this value for anything. + + @stats_delete + Delete the region with the specified id. + + + region_id returned from @stats_create + + @stats_clear + Clear all the counters except the in-flight i/o counters. + + + region_id returned from @stats_create + + @stats_list [] + List all regions registered with @stats_create. + + + An optional parameter. + If this parameter is specified, only matching regions + are returned. + If it is not specified, all regions are returned. + + Output format: + : + + precise_timestamps histogram:n1,n2,n3,... + + The strings "precise_timestamps" and "histogram" are printed only + if they were specified when creating the region. + + @stats_print [ ] + Print counters for each step-sized area of a region. + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are returned. + + + The number of lines to include in the output. + If omitted, all lines are returned. + + Output format for each step-sized area of a region: + + + + counters + + The first 11 counters have the same meaning as + `/sys/block/*/stat or /proc/diskstats`. + + Please refer to Documentation/iostats.txt for details. + + 1. the number of reads completed + 2. the number of reads merged + 3. the number of sectors read + 4. the number of milliseconds spent reading + 5. the number of writes completed + 6. the number of writes merged + 7. the number of sectors written + 8. the number of milliseconds spent writing + 9. the number of I/Os currently in progress + 10. the number of milliseconds spent doing I/Os + 11. 
the weighted number of milliseconds spent doing I/Os + + Additional counters: + + 12. the total time spent reading in milliseconds + 13. the total time spent writing in milliseconds + + @stats_print_clear [ ] + Atomically print and then clear all the counters except the + in-flight i/o counters. Useful when the client consuming the + statistics does not want to lose any statistics (those updated + between printing and clearing). + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are printed and then cleared. + + + The number of lines to process. + If omitted, all lines are printed and then cleared. + + @stats_set_aux + Store auxiliary data aux_data for the specified region. + + + region_id returned from @stats_create + + + The string that identifies data which is useful to the client + program that created the range. The kernel returns this + string back in the output of @stats_list message, but it + doesn't use this value for anything. + +Examples +======== + +Subdivide the DM device 'vol' into 100 pieces and start collecting +statistics on them:: + + dmsetup message vol 0 @stats_create - /100 + +Set the auxiliary data string to "foo bar baz" (the escape for each +space must also be escaped, otherwise the shell will consume them):: + + dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz + +List the statistics:: + + dmsetup message vol 0 @stats_list + +Print the statistics:: + + dmsetup message vol 0 @stats_print 0 + +Delete the statistics:: + + dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/admin-guide/device-mapper/striped.rst b/Documentation/admin-guide/device-mapper/striped.rst new file mode 100644 index 000000000000..e9a8da192ae1 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/striped.rst @@ -0,0 +1,61 @@ +========= +dm-stripe +========= + +Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) +device across one or more underlying devices. 
Data is written in "chunks", +with consecutive chunks rotating among the underlying devices. This can +potentially provide improved I/O throughput by utilizing several physical +devices in parallel. + +Parameters: [ ]+ + : + Number of underlying devices. + : + Size of each chunk of data. Must be at least as + large as the system's PAGE_SIZE. + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + +One or more underlying devices can be specified. The striped device size must +be a multiple of the chunk size multiplied by the number of underlying devices. + + +Example scripts +=============== + +:: + + #!/usr/bin/perl -w + # Create a striped device across any number of underlying devices. The device + # will be called "stripe_dev" and have a chunk-size of 128k. + + my $chunk_size = 128 * 2; + my $dev_name = "stripe_dev"; + my $num_devs = @ARGV; + my @devs = @ARGV; + my ($min_dev_size, $stripe_dev_size, $i); + + if (!$num_devs) { + die("Specify at least one device\n"); + } + + $min_dev_size = `blockdev --getsz $devs[0]`; + for ($i = 1; $i < $num_devs; $i++) { + my $this_size = `blockdev --getsz $devs[$i]`; + $min_dev_size = ($min_dev_size < $this_size) ? 
+ $min_dev_size : $this_size; + } + + $stripe_dev_size = $min_dev_size * $num_devs; + $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); + + $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; + for ($i = 0; $i < $num_devs; $i++) { + $table .= " $devs[$i] 0"; + } + + `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/admin-guide/device-mapper/switch.rst b/Documentation/admin-guide/device-mapper/switch.rst new file mode 100644 index 000000000000..7dde06be1a4f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/switch.rst @@ -0,0 +1,141 @@ +========= +dm-switch +========= + +The device-mapper switch target creates a device that supports an +arbitrary mapping of fixed-size regions of I/O across a fixed set of +paths. The path used for any specific region can be switched +dynamically by sending the target a message. + +It maps I/O to underlying block devices efficiently when there is a large +number of fixed-sized address regions but there is no simple pattern +that would allow for a compact representation of the mapping such as +dm-stripe. + +Background +---------- + +Dell EqualLogic and some other iSCSI storage arrays use a distributed +frameless architecture. In this architecture, the storage group +consists of a number of distinct storage arrays ("members") each having +independent controllers, disk storage and network adapters. When a LUN +is created it is spread across multiple members. The details of the +spreading are hidden from initiators connected to this storage system. +The storage group exposes a single target discovery portal, no matter +how many members are being used. When iSCSI sessions are created, each +session is connected to an eth port on a single member. Data to a LUN +can be sent on any iSCSI session, and if the blocks being accessed are +stored on another member the I/O will be forwarded as required. This +forwarding is invisible to the initiator. 
The storage layout is also +dynamic, and the blocks stored on disk may be moved from member to +member as needed to balance the load. + +This architecture simplifies the management and configuration of both +the storage group and initiators. In a multipathing configuration, it +is possible to set up multiple iSCSI sessions to use multiple network +interfaces on both the host and target to take advantage of the +increased network bandwidth. An initiator could use a simple round +robin algorithm to send I/O across all paths and let the storage array +members forward it as necessary, but there is a performance advantage to +sending data directly to the correct member. + +A device-mapper table already lets you map different regions of a +device onto different targets. However in this architecture the LUN is +spread with an address region size on the order of 10s of MBs, which +means the resulting table could have more than a million entries and +consume far too much memory. + +Using this device-mapper switch target we can now build a two-layer +device hierarchy: + + Upper Tier - Determine which array member the I/O should be sent to. + Lower Tier - Load balance amongst paths to a particular member. + +The lower tier consists of a single dm multipath device for each member. +Each of these multipath devices contains the set of paths directly to +the array member in one priority group, and leverages existing path +selectors to load balance amongst these paths. We also build a +non-preferred priority group containing paths to other array members for +failover reasons. + +The upper tier consists of a single dm-switch device. This device uses +a bitmap to look up the location of the I/O and choose the appropriate +lower tier device to route the I/O. By using a bitmap we are able to +use 4 bits for each address range in a 16 member group (which is very +large for us). This is a much denser representation than the dm table +b-tree can achieve. 
+ +Construction Parameters +======================= + + [...] [ ]+ + + The number of paths across which to distribute the I/O. + + + The number of 512-byte sectors in a region. Each region can be redirected + to any of the available paths. + + + The number of optional arguments. Currently, no optional arguments + are supported and so this must be zero. + + + The block device that represents a specific path to the device. + + + The offset of the start of data on the specific (in units + of 512-byte sectors). This number is added to the sector number when + forwarding the request to the specific path. Typically it is zero. + +Messages +======== + +set_region_mappings : []: []:... + +Modify the region table by specifying which regions are redirected to +which paths. + + + The region number (region size was specified in constructor parameters). + If index is omitted, the next region (previous index + 1) is used. + Expressed in hexadecimal (WITHOUT any prefix like 0x). + + + The path number in the range 0 ... ( - 1). + Expressed in hexadecimal (WITHOUT any prefix like 0x). + +R, + This parameter allows repetitive patterns to be loaded quickly. and + are hexadecimal numbers. The last mappings are repeated in the next + slots. + +Status +====== + +No status line is reported. + +Example +======= + +Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with +the same size. + +Create a switch device with 64kB region size:: + + dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` + switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" + +Set mappings for the first 7 entries to point to devices switch0, switch1, +switch2, switch0, switch1, switch2, switch1:: + + dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 + +Set repetitive mapping. 
This command:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 + +is equivalent to:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ + :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 diff --git a/Documentation/admin-guide/device-mapper/thin-provisioning.rst b/Documentation/admin-guide/device-mapper/thin-provisioning.rst new file mode 100644 index 000000000000..bafebf79da4b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/thin-provisioning.rst @@ -0,0 +1,427 @@ +================= +Thin provisioning +================= + +Introduction +============ + +This document describes a collection of device-mapper targets that +between them implement thin-provisioning and snapshots. + +The main highlight of this implementation, compared to the previous +implementation of snapshots, is that it allows many virtual devices to +be stored on the same data volume. This simplifies administration and +allows the sharing of data between volumes, thus reducing disk usage. + +Another significant feature is support for an arbitrary depth of +recursive snapshots (snapshots of snapshots of snapshots ...). The +previous implementation of snapshots did this by chaining together +lookup tables, and so performance was O(depth). This new +implementation uses a single data structure to avoid this degradation +with depth. Fragmentation may still be an issue, however, in some +scenarios. + +Metadata is stored on a separate device from data, giving the +administrator some freedom, for example to: + +- Improve metadata resilience by storing metadata on a mirrored volume + but data on a non-mirrored one. + +- Improve performance by storing the metadata on SSD. + +Status +====== + +These targets are considered safe for production use. But different use +cases will have different performance characteristics, for example due +to fragmentation of the data volume. 
+ +If you find this software is not performing as expected please mail +dm-devel@redhat.com with details and we'll try our best to improve +things for you. + +Userspace tools for checking and repairing the metadata have been fully +developed and are available as 'thin_check' and 'thin_repair'. The name +of the package that provides these utilities varies by distribution (on +a Red Hat distribution it is named 'device-mapper-persistent-data'). + +Cookbook +======== + +This section describes some quick recipes for using thin provisioning. +They use the dmsetup program to control the device-mapper driver +directly. End users will be advised to use a higher-level volume +manager such as LVM2 once support has been added. + +Pool device +----------- + +The pool device ties together the metadata volume and the data volume. +It maps I/O linearly to the data volume and updates the metadata via +two mechanisms: + +- Function calls from the thin targets + +- Device-mapper 'messages' from userspace which control the creation of new + virtual devices amongst other things. + +Setting up a fresh pool device +------------------------------ + +Setting up a pool device requires a valid metadata device, and a +data device. If you do not have an existing metadata device you can +make one by zeroing the first 4k to indicate empty metadata. + + dd if=/dev/zero of=$metadata_dev bs=4096 count=1 + +The amount of metadata you need will vary according to how many blocks +are shared between thin devices (i.e. through snapshots). If you have +less sharing than average you'll need a larger-than-average metadata device. + +As a guide, we suggest you calculate the number of bytes to use in the +metadata device as 48 * $data_dev_size / $data_block_size but round it up +to 2MB if the answer is smaller. If you're creating large numbers of +snapshots which are recording large amounts of change, you may find you +need to increase this. 
+ +The largest size supported is 16GB: If the device is larger, +a warning will be issued and the excess space will not be used. + +Reloading a pool table +---------------------- + +You may reload a pool's table, indeed this is how the pool is resized +if it runs out of space. (N.B. While specifying a different metadata +device when reloading is not forbidden at the moment, things will go +wrong if it does not route I/O to exactly the same on-disk location as +previously.) + +Using an existing pool device +----------------------------- + +:: + + dmsetup create pool \ + --table "0 20971520 thin-pool $metadata_dev $data_dev \ + $data_block_size $low_water_mark" + +$data_block_size gives the smallest unit of disk space that can be +allocated at a time expressed in units of 512-byte sectors. +$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a +multiple of 128 (64KB). $data_block_size cannot be changed after the +thin-pool is created. People primarily interested in thin provisioning +may want to use a value such as 1024 (512KB). People doing lots of +snapshotting may want a smaller value such as 128 (64KB). If you are +not zeroing newly-allocated data, a larger $data_block_size in the +region of 256000 (128MB) is suggested. + +$low_water_mark is expressed in blocks of size $data_block_size. If +free space on the data device drops below this level then a dm event +will be triggered which a userspace daemon should catch allowing it to +extend the pool device. Only one such event will be sent. + +No special event is triggered if a just resumed device's free space is below +the low water mark. However, resuming a device always triggers an +event; a userspace daemon should verify that free space exceeds the low +water mark when handling this event. + +A low water mark for the metadata device is maintained in the kernel and +will trigger a dm event if free space on the metadata device drops below +it. 
+ +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a FLUSH or FUA bio is written. +If no such requests are made then commits will occur every second. This +means the thin-provisioning target behaves like a physical disk that has +a volatile write cache. If power is lost you may lose some recent +writes. The metadata should always be consistent in spite of any crash. + +If data space is exhausted the pool will either error or queue IO +according to the configuration (see: error_if_no_space). If metadata +space is exhausted or a metadata operation fails: the pool will error IO +until the pool is taken offline and repair is performed to 1) fix any +potential inconsistencies and 2) clear the flag that imposes repair. +Once the pool's metadata device is repaired it may be resized, which +will allow the pool to return to normal operation. Note that if a pool +is flagged as needing repair, the pool's data and metadata devices +cannot be resized until repair is performed. It should also be noted +that when the pool's metadata space is exhausted the current metadata +transaction is aborted. Given that the pool will cache IO whose +completion may have already been acknowledged to upper IO layers +(e.g. filesystem) it is strongly suggested that consistency checks +(e.g. fsck) be performed on those layers when repair of the pool is +required. + +Thin provisioning +----------------- + +i) Creating a new thinly-provisioned volume. + + To create a new thinly- provisioned volume you must send a message to an + active pool device, /dev/mapper/pool in this example:: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + + Here '0' is an identifier for the volume, a 24-bit number. It's up + to the caller to allocate and manage these identifiers. If the + identifier is already in use, the message will fail with -EEXIST. + +ii) Using a thinly-provisioned volume. 
+ + Thinly-provisioned volumes are activated using the 'thin' target:: + + dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" + + The last parameter is the identifier for the thinp device. + +Internal snapshots +------------------ + +i) Creating an internal snapshot. + + Snapshots are created with another message to the pool. + + N.B. If the origin device that you wish to snapshot is active, you + must suspend it before creating the snapshot to avoid corruption. + This is NOT enforced at the moment, so please be careful! + + :: + + dmsetup suspend /dev/mapper/thin + dmsetup message /dev/mapper/pool 0 "create_snap 1 0" + dmsetup resume /dev/mapper/thin + + Here '1' is the identifier for the volume, a 24-bit number. '0' is the + identifier for the origin device. + +ii) Using an internal snapshot. + + Once created, the user doesn't have to worry about any connection + between the origin and the snapshot. Indeed the snapshot is no + different from any other thinly-provisioned device and can be + snapshotted itself via the same method. It's perfectly legal to + have only one of them active, and there's no ordering requirement on + activating or removing them both. (This differs from conventional + device-mapper snapshots.) + + Activate it exactly the same way as any other thinly-provisioned volume:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" + +External snapshots +------------------ + +You can use an external **read only** device as an origin for a +thinly-provisioned volume. Any read to an unprovisioned area of the +thin device will be passed through to the origin. Writes trigger +the allocation of new blocks as usual. + +One use case for this is VM hosts that want to run guests on +thinly-provisioned volumes but have the base image on another device +(possibly shared between many VMs). + +You must not write to the origin device if you use this technique! 
+Of course, you may write to the thin device and take internal snapshots +of the thin volume. + +i) Creating a snapshot of an external device + + This is the same as creating a thin device. + You don't mention the origin at this stage. + + :: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + +ii) Using a snapshot of an external device. + + Append an extra parameter to the thin target specifying the origin:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" + + N.B. All descendants (internal snapshots) of this snapshot require the + same extra origin parameter. + +Deactivation +------------ + +All devices using a pool must be deactivated before the pool itself +can be. + +:: + + dmsetup remove thin + dmsetup remove snap + dmsetup remove pool + +Reference +========= + +'thin-pool' target +------------------ + +i) Constructor + + :: + + thin-pool \ + [ []*] + + Optional feature arguments: + + skip_block_zeroing: + Skip the zeroing of newly-provisioned blocks. + + ignore_discard: + Disable discard support. + + no_discard_passdown: + Don't pass discards down to the underlying + data device, but just remove the mapping. + + read_only: + Don't allow any changes to be made to the pool + metadata. This mode is only available after the + thin-pool has been created and first used in full + read/write mode. It cannot be specified on initial + thin-pool creation. + + error_if_no_space: + Error IOs, instead of queueing, if no space. + + Data block size must be between 64KB (128 sectors) and 1GB + (2097152 sectors) inclusive. + + +ii) Status + + :: + + / + / + ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space + needs_check|- metadata_low_watermark + + transaction id: + A 64-bit number used by userspace to help synchronise with metadata + from volume managers. + + used data blocks / total data blocks + If the number of free blocks drops below the pool's low water mark a + dm event will be sent to userspace. 
This event is edge-triggered and + it will occur only once after each resume so volume manager writers + should register for the event and then check the target's status. + + held metadata root: + The location, in blocks, of the metadata root that has been + 'held' for userspace read access. '-' indicates there is no + held root. + + discard_passdown|no_discard_passdown + Whether or not discards are actually being passed down to the + underlying device. When this is enabled when loading the table, + it can get disabled if the underlying device doesn't support it. + + ro|rw|out_of_data_space + If the pool encounters certain types of device failures it will + drop into a read-only metadata mode in which no changes to + the pool metadata (like allocating new blocks) are permitted. + + In serious cases where even a read-only mode is deemed unsafe + no further I/O will be permitted and the status will just + contain the string 'Fail'. The userspace recovery tools + should then be used. + + error_if_no_space|queue_if_no_space + If the pool runs out of data or metadata space, the pool will + either queue or error the IO destined to the data device. The + default is to queue the IO until more space is added or the + 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool + module parameter can be used to change this timeout -- it + defaults to 60 seconds but may be disabled using a value of 0. + + needs_check + A metadata operation has failed, resulting in the needs_check + flag being set in the metadata's superblock. The metadata + device must be deactivated and checked/repaired before the + thin-pool can be made fully operational again. '-' indicates + needs_check is not set. + + metadata_low_watermark: + Value of metadata low watermark in blocks. The kernel sets this + value internally but userspace needs to know this value to + determine if an event was caused by crossing this threshold. + +iii) Messages + + create_thin + Create a new thinly-provisioned device. 
+ is an arbitrary unique 24-bit identifier chosen by + the caller. + + create_snap + Create a new snapshot of another thinly-provisioned device. + is an arbitrary unique 24-bit identifier chosen by + the caller. + is the identifier of the thinly-provisioned device + of which the new device will be a snapshot. + + delete + Deletes a thin device. Irreversible. + + set_transaction_id + Userland volume managers, such as LVM, need a way to + synchronise their external metadata with the internal metadata of the + pool target. The thin-pool target offers to store an + arbitrary 64-bit transaction id and return it on the target's + status line. To avoid races you must provide what you think + the current transaction id is when you change it with this + compare-and-swap message. + + reserve_metadata_snap + Reserve a copy of the data mapping btree for use by userland. + This allows userland to inspect the mappings as they were when + this message was executed. Use the pool's status command to + get the root block associated with the metadata snapshot. + + release_metadata_snap + Release a previously reserved copy of the data mapping btree. + +'thin' target +------------- + +i) Constructor + + :: + + thin [] + + pool dev: + the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 + + dev id: + the internal device identifier of the device to be + activated. + + external origin dev: + an optional block device outside the pool to be treated as a + read-only snapshot origin: reads to unprovisioned areas of the + thin target will be mapped to this device. + +The pool doesn't store any size against the thin devices. If you +load a thin target that is smaller than you've been using previously, +then you'll have no access to blocks mapped beyond the end. If you +load a target that is bigger than before, then extra blocks will be +provisioned as and when needed. + +ii) Status + + + If the pool has encountered device errors and failed, the status + will just contain the string 'Fail'. 
The userspace recovery + tools should then be used. + + In the case where is 0, there is no highest + mapped sector and the value of is unspecified. diff --git a/Documentation/admin-guide/device-mapper/unstriped.rst b/Documentation/admin-guide/device-mapper/unstriped.rst new file mode 100644 index 000000000000..0a8d3eb3f072 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/unstriped.rst @@ -0,0 +1,135 @@ +================================ +Device-mapper "unstriped" target +================================ + +Introduction +============ + +The device-mapper "unstriped" target provides a transparent mechanism to +unstripe a device-mapper "striped" target to access the underlying disks +without having to touch the true backing block-device. It can also be +used to unstripe a hardware RAID-0 to access backing disks. + +Parameters: + + + + The number of stripes in the RAID 0. + + + The amount of 512B sectors in the chunk striping. + + + The block device you wish to unstripe. + + + The stripe number within the device that corresponds to physical + drive you wish to unstripe. This must be 0 indexed. + + +Why use this module? +==================== + +An example of undoing an existing dm-stripe +------------------------------------------- + +This small bash script will setup 4 loop devices and use the existing +striped target to combine the 4 devices into one. It then will use +the unstriped target ontop of the striped device to access the +individual backing loop devices. 
We write data to the newly exposed +unstriped devices and verify the data written matches the correct +underlying device on the striped array:: + + #!/bin/bash + + MEMBER_SIZE=$((128 * 1024 * 1024)) + NUM=4 + SEQ_END=$((${NUM}-1)) + CHUNK=256 + BS=4096 + + RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) + DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" + COUNT=$((${MEMBER_SIZE} / ${BS})) + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" + done + + echo $DM_PARMS | dmsetup create raid0 + for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} + done + + dmsetup remove raid0 + + for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} + done + +Another example +--------------- + +Intel NVMe drives contain two cores on the physical device. +Each core of the drive has segregated access to its LBA range. +The current LBA model has a RAID 0 128k chunk on each core, resulting +in a 256k stripe across the two cores:: + + Core 0: Core 1: + __________ __________ + | LBA 512| | LBA 768| + | LBA 0 | | LBA 256| + ---------- ---------- + +The purpose of this unstriping is to provide better QoS in noisy +neighbor environments. When two partitions are created on the +aggregate drive without this unstriping, reads on one partition +can affect writes on another partition. This is because the partitions +are striped across the two cores. When we unstripe this hardware RAID 0 +and make partitions on each new exposed device the two partitions are now +physically separated. 
+ +With the dm-unstriped target we're able to segregate an fio script that +has read and write jobs that are independent of each other. Compared to +when we run the test on a combined drive with partitions, we were able +to get a 92% reduction in read latency using this device mapper target. + + +Example dmsetup usage +===================== + +unstriped ontop of Intel NVMe device that has 2 cores +----------------------------------------------------- + +:: + + dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' + dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +There will now be two devices that expose Intel NVMe core 0 and 1 +respectively:: + + /dev/mapper/nvmset0 + /dev/mapper/nvmset1 + +unstriped ontop of striped with 4 drives using 128K chunk size +-------------------------------------------------------------- + +:: + + dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' + dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' + dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' + dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst new file mode 100644 index 000000000000..a4d1c1476d72 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -0,0 +1,229 @@ +========= +dm-verity +========= + +Device-Mapper's "verity" target provides transparent integrity checking of +block devices using a cryptographic digest provided by the kernel crypto API. +This target is read-only. + +Construction Parameters +======================= + +:: + + + + + + [<#opt_params> ] + + + This is the type of the on-disk hash format. + + 0 is the original format used in the Chromium OS. 
+ The salt is appended when hashing, digests are stored continuously and + the rest of the block is padded with zeroes. + + 1 is the current format that should be used for new devices. + The salt is prepended when hashing and each digest is + padded with zeroes to the power of two. + + + This is the device containing data, the integrity of which needs to be + checked. It may be specified as a path, like /dev/sdaX, or a device number, + :. + + + This is the device that supplies the hash tree data. It may be + specified similarly to the device path and may be the same device. If the + same device is used, the hash_start should be outside the configured + dm-verity device. + + + The block size on a data device in bytes. + Each block corresponds to one digest on the hash device. + + + The size of a hash block in bytes. + + + The number of data blocks on the data device. Additional blocks are + inaccessible. You can place hashes to the same partition as data, in this + case hashes are placed after . + + + This is the offset, in -blocks, from the start of hash_dev + to the root block of the hash tree. + + + The cryptographic hash algorithm used for this device. This should + be the name of the algorithm, like "sha1". + + + The hexadecimal encoding of the cryptographic hash of the root hash block + and the salt. This hash should be trusted as there is no other authenticity + beyond this point. + + + The hexadecimal encoding of the salt value. + +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 ignore_corruption + +ignore_corruption + Log corrupted blocks, but allow read operations to proceed normally. + +restart_on_corruption + Restart the system when a corrupted block is discovered. 
This option is + not compatible with ignore_corruption and requires user space support to + avoid restart loops. + +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + +use_fec_from_device + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity is encrypted the should be too. + +fec_roots + Number of generator roots. This equals to the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks + The number of encoding data blocks on the FEC device. The block size for + the FEC device is . + +fec_start + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + +check_at_most_once + Verify data blocks only the first time they are read from the data device, + rather than every time. This reduces the overhead of dm-verity so that it + can be used on systems that are memory and/or CPU constrained. However, it + provides a reduced level of security because only offline tampering of the + data device's content will be detected, not online tampering. + + Hash blocks are still verified each time they are read from the hash device, + since verification of hash blocks is less performance critical than data + blocks, and a hash block will not be verified any more after all the data + blocks it covers have been verified anyway. 
+ +Theory of operation +=================== + +dm-verity is meant to be set up as part of a verified boot path. This +may be anything ranging from a boot using tboot or trustedgrub to just +booting from a known-good device (like a USB drive or CD). + +When a dm-verity device is configured, it is expected that the caller +has been authenticated in some way (cryptographic signatures, etc). +After instantiation, all hashes will be verified on-demand during +disk access. If they cannot be verified up to the root node of the +tree, the root hash, then the I/O will fail. This should detect +tampering with any data on the device and the hash data. + +Cryptographic hashes are used to assert the integrity of the device on a +per-block basis. This allows for a lightweight hash computation on first read +into the page cache. Block hashes are stored linearly, aligned to the nearest +block size. + +If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + +Hash Tree +--------- + +Each node in the tree is a cryptographic hash. If it is a leaf node, the hash +of some data block on disk is calculated. If it is an intermediary node, +the hash of a number of child nodes is calculated. + +Each entry in the tree is a collection of neighboring nodes that fit in one +block. The number is determined based on block_size and the size of the +selected cryptographic digest algorithm. The hashes are linearly-ordered in +this entry and any unaligned trailing space is ignored but included when +calculating the parent node. + +The tree looks something like: + + alg = sha256, num_blocks = 32768, block_size = 4096 + +:: + + [ root ] + / . . . \ + [entry_0] [entry_1] + / . . . \ . . . \ + [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] + / ... \ / . . . \ / \ + blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . 
blk_32767 + + +On-disk format +============== + +The verity kernel code does not read the verity metadata on-disk header. +It only reads the hash blocks which directly follow the header. +It is expected that a user-space tool will verify the integrity of the +verity header. + +Alternatively, the header can be omitted and the dmsetup parameters can +be passed via the kernel command-line in a rooted chain of trust where +the command-line is verified. + +Directly following the header (and with sector number padded to the next hash +block boundary) are the hash blocks which are stored a depth at a time +(starting from the root), sorted in order of increasing index. + +The full specification of kernel parameters and on-disk metadata format +is available at the cryptsetup project's wiki page + + https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity + +Status +====== +V (for Valid) is returned if every check performed so far was valid. +If any check failed, C (for Corruption) is returned. + +Example +======= +Set up a device:: + + # dmsetup create vroot --readonly --table \ + "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ + "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ + "1234000000000000000000000000000000000000000000000000000000000000" + +A command line tool veritysetup is available to compute or verify +the hash tree or activate the kernel device. This is available from +the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ +(as a libcryptsetup extension). + +Create hash on the device:: + + # veritysetup format /dev/sda1 /dev/sda2 + ... 
+ Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 + +Activate the device:: + + # veritysetup create vroot /dev/sda1 /dev/sda2 \ + 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/admin-guide/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst new file mode 100644 index 000000000000..d3d7690f5e8d --- /dev/null +++ b/Documentation/admin-guide/device-mapper/writecache.rst @@ -0,0 +1,79 @@ +================= +Writecache target +================= + +The writecache target caches writes on persistent memory or on SSD. It +doesn't cache reads because reads are supposed to be cached in page cache +in normal RAM. + +When the device is constructed, the first sector should be zeroed or the +first sector should contain a valid superblock from a previous invocation. + +Constructor parameters: + +1. type of the cache device - "p" or "s" + + - p - persistent memory + - s - SSD +2. the underlying device that will be cached +3. the cache device +4. block size (4096 is recommended; the maximum block size is the page + size) +5. the number of optional parameters (the parameters with an argument + count as two) + + start_sector n (default: 0) + offset from the start of cache device in 512-byte sectors + high_watermark n (default: 50) + start writeback when the number of used blocks reaches this + watermark + low_watermark x (default: 45) + stop writeback when the number of used blocks drops below + this watermark + writeback_jobs n (default: unlimited) + limit the number of blocks that are in flight during + writeback. Setting this value reduces writeback + throughput, but it may improve latency of read requests + autocommit_blocks n (default: 64 for pmem, 65536 for ssd) + when the application writes this amount of blocks without + issuing the FLUSH request, the blocks are automatically + committed + autocommit_time ms (default: 1000) + autocommit time in milliseconds. 
The data is automatically + committed if this time passes and no FLUSH request is + received + fua (by default on) + applicable only to persistent memory - use the FUA flag + when writing data from persistent memory back to the + underlying device + nofua + applicable only to persistent memory - don't use the FUA + flag when writing back data and send the FLUSH request + afterwards + + - some underlying devices perform better with fua, some + with nofua. The user should test it + +Status: +1. error indicator - 0 if there was no error, otherwise error number +2. the number of blocks +3. the number of free blocks +4. the number of blocks under writeback + +Messages: + flush + flush the cache device. The message returns successfully + if the cache device was flushed without an error + flush_on_suspend + flush the cache device on next suspend. Use this message + when you are going to remove the cache device. The proper + sequence for removing the cache device is: + + 1. send the "flush_on_suspend" message + 2. load an inactive table with a linear target that maps + to the underlying device + 3. suspend the device + 4. ask for status and verify that there are no errors + 5. resume the device, so that it will use the linear + target + 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/admin-guide/device-mapper/zero.rst b/Documentation/admin-guide/device-mapper/zero.rst new file mode 100644 index 000000000000..11fb5cf4597c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/zero.rst @@ -0,0 +1,37 @@ +======= +dm-zero +======= + +Device-Mapper's "zero" target provides a block-device that always returns +zero'd data on reads and silently drops writes. This is similar behavior to +/dev/zero, but as a block-device instead of a character-device. + +Dm-zero has no target-specific parameters. + +One very interesting use of dm-zero is for creating "sparse" devices in +conjunction with dm-snapshot. 
A sparse device reports a device-size larger +than the amount of actual storage space available for that device. A user can +write data anywhere within the sparse device and read it back like a normal +device. Reads to previously unwritten areas will return a zero'd buffer. When +enough data has been written to fill up the actual storage space, the sparse +device is deactivated. This can be very useful for testing device and +filesystem limitations. + +To create a sparse device, start by creating a dm-zero device that's the +desired size of the sparse device. For this example, we'll assume a 10TB +sparse device:: + + TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors + echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 + +Then create a snapshot of the zero device, using any available block-device as +the COW device. The size of the COW device will determine the amount of real +space available to the sparse device. For this example, we'll assume /dev/sdb1 +is an available 10GB partition:: + + echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ + dmsetup create sparse1 + +This will create a 10TB sparse device called /dev/mapper/sparse1 that has +10GB of actual storage space available. If more than 10GB of data is written +to this device, it will start returning I/O errors. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index abc2c4e83939..64e97a969857 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -80,6 +80,7 @@ configure specific aspects of kernel behavior to your liking. namespaces/index perf-security acpi/index + device-mapper/index .. 
only:: subproject and html diff --git a/Documentation/device-mapper/cache-policies.rst b/Documentation/device-mapper/cache-policies.rst deleted file mode 100644 index b17fe352fc41..000000000000 --- a/Documentation/device-mapper/cache-policies.rst +++ /dev/null @@ -1,131 +0,0 @@ -============================= -Guidance for writing policies -============================= - -Try to keep transactionality out of it. The core is careful to -avoid asking about anything that is migrating. This is a pain, but -makes it easier to write the policies. - -Mappings are loaded into the policy at construction time. - -Every bio that is mapped by the target is referred to the policy. -The policy can return a simple HIT or MISS or issue a migration. - -Currently there's no way for the policy to issue background work, -e.g. to start writing back dirty blocks that are going to be evicted -soon. - -Because we map bios, rather than requests it's easy for the policy -to get fooled by many small bios. For this reason the core target -issues periodic ticks to the policy. It's suggested that the policy -doesn't update states (eg, hit counts) for a block more than once -for each tick. The core ticks by watching bios complete, and so -trying to see when the io scheduler has let the ios run. - - -Overview of supplied cache replacement policies -=============================================== - -multiqueue (mq) ---------------- - -This policy is now an alias for smq (see below). - -The following tunables are accepted, but have no effect:: - - 'sequential_threshold <#nr_sequential_ios>' - 'random_threshold <#nr_random_ios>' - 'read_promote_adjustment ' - 'write_promote_adjustment ' - 'discard_promote_adjustment ' - -Stochastic multiqueue (smq) ---------------------------- - -This policy is the default. - -The stochastic multi-queue (smq) policy addresses some of the problems -with the multiqueue (mq) policy. 
- -The smq policy (vs mq) offers the promise of less memory utilization, -improved performance and increased adaptability in the face of changing -workloads. smq also does not have any cumbersome tuning knobs. - -Users may switch from "mq" to "smq" simply by appropriately reloading a -DM table that is using the cache target. Doing so will cause all of the -mq policy's hints to be dropped. Also, performance of the cache may -degrade slightly until smq recalculates the origin device's hotspots -that should be cached. - -Memory usage -^^^^^^^^^^^^ - -The mq policy used a lot of memory; 88 bytes per cache block on a 64 -bit machine. - -smq uses 28bit indexes to implement its data structures rather than -pointers. It avoids storing an explicit hit count for each block. It -has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of -the entries (each hotspot block covers a larger area than a single -cache block). - -All this means smq uses ~25bytes per cache block. Still a lot of -memory, but a substantial improvement nontheless. - -Level balancing -^^^^^^^^^^^^^^^ - -mq placed entries in different levels of the multiqueue structures -based on their hit count (~ln(hit count)). This meant the bottom -levels generally had the most entries, and the top ones had very -few. Having unbalanced levels like this reduced the efficacy of the -multiqueue. - -smq does not maintain a hit count, instead it swaps hit entries with -the least recently used entry from the level above. The overall -ordering being a side effect of this stochastic process. With this -scheme we can decide how many entries occupy each multiqueue level, -resulting in better promotion/demotion decisions. - -Adaptability: -The mq policy maintained a hit count for each cache block. For a -different block to get promoted to the cache its hit count has to -exceed the lowest currently in the cache. This meant it could take a -long time for the cache to adapt between varying IO patterns. 
- -smq doesn't maintain hit counts, so a lot of this problem just goes -away. In addition it tracks performance of the hotspot queue, which -is used to decide which blocks to promote. If the hotspot queue is -performing badly then it starts moving entries more quickly between -levels. This lets it adapt to new IO patterns very quickly. - -Performance -^^^^^^^^^^^ - -Testing smq shows substantially better performance than mq. - -cleaner -------- - -The cleaner writes back all dirty blocks in a cache to decommission it. - -Examples -======== - -The syntax for a table is:: - - cache - <#feature_args> []* - <#policy_args> []* - -The syntax to send a message using the dmsetup command is:: - - dmsetup message 0 sequential_threshold 1024 - dmsetup message 0 random_threshold 8 - -Using dmsetup:: - - dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ - /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" - creates a 128GB large mapped device named 'blah' with the - sequential threshold set to 1024 and the random_threshold set to 8. diff --git a/Documentation/device-mapper/cache.rst b/Documentation/device-mapper/cache.rst deleted file mode 100644 index f15e5254d05b..000000000000 --- a/Documentation/device-mapper/cache.rst +++ /dev/null @@ -1,337 +0,0 @@ -===== -Cache -===== - -Introduction -============ - -dm-cache is a device mapper target written by Joe Thornber, Heinz -Mauelshagen, and Mike Snitzer. - -It aims to improve performance of a block device (eg, a spindle) by -dynamically migrating some of its data to a faster, smaller device -(eg, an SSD). - -This device-mapper solution allows us to insert this caching at -different levels of the dm stack, for instance above the data device for -a thin-provisioning pool. Caching solutions that are integrated more -closely with the virtual memory system should give better performance. - -The target reuses the metadata library used in the thin-provisioning -library. 
- -The decision as to what data to migrate and when is left to a plug-in -policy module. Several of these have been written as we experiment, -and we hope other people will contribute others for specific io -scenarios (eg. a vm image server). - -Glossary -======== - - Migration - Movement of the primary copy of a logical block from one - device to the other. - Promotion - Migration from slow device to fast device. - Demotion - Migration from fast device to slow device. - -The origin device always contains a copy of the logical block, which -may be out of date or kept in sync with the copy on the cache device -(depending on policy). - -Design -====== - -Sub-devices ------------ - -The target is constructed by passing three devices to it (along with -other parameters detailed later): - -1. An origin device - the big, slow one. - -2. A cache device - the small, fast one. - -3. A small metadata device - records which blocks are in the cache, - which are dirty, and extra hints for use by the policy object. - This information could be put on the cache device, but having it - separate allows the volume manager to configure it differently, - e.g. as a mirror for extra robustness. This metadata device may only - be used by a single cache device. - -Fixed block size ----------------- - -The origin is divided up into blocks of a fixed size. This block size -is configurable when you first create the cache. Typically we've been -using block sizes of 256KB - 1024KB. The block size must be between 64 -sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). - -Having a fixed block size simplifies the target a lot. But it is -something of a compromise. For instance, a small part of a block may be -getting hit a lot, yet the whole block will be promoted to the cache. -So large block sizes are bad because they waste cache space. And small -block sizes are bad because they increase the amount of metadata (both -in core and on disk). 
- -Cache operating modes ---------------------- - -The cache has three operating modes: writeback, writethrough and -passthrough. - -If writeback, the default, is selected then a write to a block that is -cached will go only to the cache and the block will be marked dirty in -the metadata. - -If writethrough is selected then a write to a cached block will not -complete until it has hit both the origin and cache devices. Clean -blocks should remain clean. - -If passthrough is selected, useful when the cache contents are not known -to be coherent with the origin device, then all reads are served from -the origin device (all reads miss the cache) and all writes are -forwarded to the origin device; additionally, write hits cause cache -block invalidates. To enable passthrough mode the cache must be clean. -Passthrough mode allows a cache device to be activated without having to -worry about coherency. Coherency that exists is maintained, although -the cache will gradually cool as writes take place. If the coherency of -the cache can later be verified, or established through use of the -"invalidate_cblocks" message, the cache device can be transitioned to -writethrough or writeback mode while still warm. Otherwise, the cache -contents can be discarded prior to transitioning to the desired -operating mode. - -A simple cleaner policy is provided, which will clean (write back) all -dirty blocks in a cache. Useful for decommissioning a cache or when -shrinking a cache. Shrinking the cache's fast device requires all cache -blocks, in the area of the cache being removed, to be clean. If the -area being removed from the cache still contains dirty blocks the resize -will fail. Care must be taken to never reduce the volume used for the -cache's fast device until the cache is clean. This is of particular -importance if writeback mode is used. Writethrough and passthrough -modes already maintain a clean cache. 
Future support to partially clean -the cache, above a specified threshold, will allow for keeping the cache -warm and in writeback mode during resize. - -Migration throttling --------------------- - -Migrating data between the origin and cache device uses bandwidth. -The user can set a throttle to prevent more than a certain amount of -migration occurring at any one time. Currently we're not taking any -account of normal io traffic going to the devices. More work needs -doing here to avoid migrating during those peak io moments. - -For the time being, a message "migration_threshold <#sectors>" -can be used to set the maximum number of sectors being migrated, -the default being 2048 sectors (1MB). - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the cache behaves like a physical disk that has a volatile write -cache. If power is lost you may lose some recent writes. The metadata -should always be consistent in spite of any crash. - -The 'dirty' state for a cache block changes far too frequently for us -to keep updating it on the fly. So we treat it as a hint. In normal -operation it will be written when the dm device is suspended. If the -system crashes all cache blocks will be assumed dirty when restarted. - -Per-block policy hints ----------------------- - -Policy plug-ins can store a chunk of data per cache block. It's up to -the policy how big this chunk is, but it should be kept small. Like the -dirty flags this data is lost if there's a crash so a safe fallback -value should always be possible. - -Policy hints affect performance, not correctness. - -Policy messaging ----------------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. Refer to cache-policies.txt. 
- -Discard bitset resolution -------------------------- - -We can avoid copying data during migration if we know the block has -been discarded. A prime example of this is when mkfs discards the -whole block device. We store a bitset tracking the discard state of -blocks. However, we allow this bitset to have a different block size -from the cache blocks. This is because we need to track the discard -state for all of the origin device (compare with the dirty bitset -which is just for the smaller cache device). - -Target interface -================ - -Constructor ------------ - - :: - - cache - <#feature args> []* - <#policy args> [policy args]* - - ================ ======================================================= - metadata dev fast device holding the persistent metadata - cache dev fast device holding cached data blocks - origin dev slow device holding original data blocks - block size cache unit size in sectors - - #feature args number of feature arguments passed - feature args writethrough or passthrough (The default is writeback.) - - policy the replacement policy to use - #policy args an even number of arguments corresponding to - key/value pairs passed to the policy - policy args key/value pairs passed to the policy - E.g. 'sequential_threshold 1024' - See cache-policies.txt for details. - ================ ======================================================= - -Optional feature arguments are: - - - ==================== ======================================================== - writethrough write through caching that prohibits cache block - content from being different from origin block content. - Without this argument, the default behaviour is to write - back cache block contents later for performance reasons, - so they may differ from the corresponding origin blocks. - - passthrough a degraded mode useful for various cache coherency - situations (e.g., rolling back snapshots of - underlying storage). Reads and writes always go to - the origin. 
If a write goes to a cached origin - block, then the cache block is invalidated. - To enable passthrough mode the cache must be clean. - - metadata2 use version 2 of the metadata. This stores the dirty - bits in a separate btree, which improves speed of - shutting down the cache. - - no_discard_passdown disable passing down discards from the cache - to the origin's data device. - ==================== ======================================================== - -A policy called 'default' is always registered. This is an alias for -the policy we currently think is giving best all round performance. - -As the default policy could vary between kernels, if you are relying on -the characteristics of a specific policy, always request it by name. - -Status ------- - -:: - - <#used metadata blocks>/<#total metadata blocks> - <#used cache blocks>/<#total cache blocks> - <#read hits> <#read misses> <#write hits> <#write misses> - <#demotions> <#promotions> <#dirty> <#features> * - <#core args> * <#policy args> * - - - -========================= ===================================================== -metadata block size Fixed block size for each metadata block in - sectors -#used metadata blocks Number of metadata blocks used -#total metadata blocks Total number of metadata blocks -cache block size Configurable block size for the cache device - in sectors -#used cache blocks Number of blocks resident in the cache -#total cache blocks Total number of cache blocks -#read hits Number of times a READ bio has been mapped - to the cache -#read misses Number of times a READ bio has been mapped - to the origin -#write hits Number of times a WRITE bio has been mapped - to the cache -#write misses Number of times a WRITE bio has been - mapped to the origin -#demotions Number of times a block has been removed - from the cache -#promotions Number of times a block has been moved to - the cache -#dirty Number of blocks in the cache that differ - from the origin -#feature args Number of feature 
args to follow -feature args 'writethrough' (optional) -#core args Number of core arguments (must be even) -core args Key/value pairs for tuning the core - e.g. migration_threshold -policy name Name of the policy -#policy args Number of policy arguments to follow (must be even) -policy args Key/value pairs e.g. sequential_threshold -cache metadata mode ro if read-only, rw if read-write - - In serious cases where even a read-only mode is - deemed unsafe no further I/O will be permitted and - the status will just contain the string 'Fail'. - The userspace recovery tools should then be used. -needs_check 'needs_check' if set, '-' if not set - A metadata operation has failed, resulting in the - needs_check flag being set in the metadata's - superblock. The metadata device must be - deactivated and checked/repaired before the - cache can be made fully operational again. - '-' indicates needs_check is not set. -========================= ===================================================== - -Messages --------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. (A sysfs interface would also be possible.) - -The message format is:: - - - -E.g.:: - - dmsetup message my_cache 0 sequential_threshold 1024 - - -Invalidation is removing an entry from the cache without writing it -back. Cache blocks can be invalidated via the invalidate_cblocks -message, which takes an arbitrary number of cblock ranges. Each cblock -range's end value is "one past the end", meaning 5-10 expresses a range -of values from 5 to 9. Each cblock must be expressed as a decimal -value, in the future a variant message that takes cblock ranges -expressed in hexadecimal may be needed to better support efficient -invalidation of larger caches. 
The cache must be in passthrough mode -when invalidate_cblocks is used:: - - invalidate_cblocks [|-]* - -E.g.:: - - dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 - -Examples -======== - -The test suite can be found here: - -https://github.com/jthornber/device-mapper-test-suite - -:: - - dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' - dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ - mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/device-mapper/delay.rst b/Documentation/device-mapper/delay.rst deleted file mode 100644 index 917ba8c33359..000000000000 --- a/Documentation/device-mapper/delay.rst +++ /dev/null @@ -1,31 +0,0 @@ -======== -dm-delay -======== - -Device-Mapper's "delay" target delays reads and/or writes -and maps them to different devices. - -Parameters:: - - [ - [ ]] - -With separate write parameters, the first set is only used for reads. -Offsets are specified in sectors. -Delays are specified in milliseconds. - -Example scripts -=============== - -:: - - #!/bin/sh - # Create device delaying rw operation for 500ms - echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed - -:: - - #!/bin/sh - # Create device delaying only write operation for 500ms and - # splitting reads and writes to different devices $1 $2 - echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed diff --git a/Documentation/device-mapper/dm-crypt.rst b/Documentation/device-mapper/dm-crypt.rst deleted file mode 100644 index 8f4a3f889d43..000000000000 --- a/Documentation/device-mapper/dm-crypt.rst +++ /dev/null @@ -1,173 +0,0 @@ -======== -dm-crypt -======== - -Device-Mapper's "crypt" target provides transparent encryption of block devices -using the kernel crypto API. 
- -For a more detailed description of supported parameters see: -https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt - -Parameters:: - - \ - [<#opt_params> ] - - - Encryption cipher, encryption mode and Initial Vector (IV) generator. - - The cipher specifications format is:: - - cipher[:keycount]-chainmode-ivmode[:ivopts] - - Examples:: - - aes-cbc-essiv:sha256 - aes-xts-plain64 - serpent-xts-plain64 - - Cipher format also supports direct specification with kernel crypt API - format (selected by capi: prefix). The IV specification is the same - as for the first format type. - This format is mainly used for specification of authenticated modes. - - The crypto API cipher specifications format is:: - - capi:cipher_api_spec-ivmode[:ivopts] - - Examples:: - - capi:cbc(aes)-essiv:sha256 - capi:xts(aes)-plain64 - - Examples of authenticated modes:: - - capi:gcm(aes)-random - capi:authenc(hmac(sha256),xts(aes))-random - capi:rfc7539(chacha20,poly1305)-random - - The /proc/crypto contains a list of curently loaded crypto modes. - - - Key used for encryption. It is encoded either as a hexadecimal number - or it can be passed as prefixed with single colon - character (':') for keys residing in kernel keyring service. - You can only use key sizes that are valid for the selected cipher - in combination with the selected iv mode. - Note that for some iv modes the key string can contain additional - keys (for example IV seed) so the key contains more parts concatenated - into a single string. - - - The kernel keyring key is identified by string in following format: - ::. - - - The encryption key size in bytes. The kernel key payload size must match - the value passed in . - - - Either 'logon' or 'user' kernel key type. - - - The kernel keyring key description crypt target should look for - when loading key of . - - - Multi-key compatibility mode. You can define keys and - then sectors are encrypted according to their offsets (sector 0 uses key0; - sector 1 uses key1 etc.). 
must be a power of two. - - - The IV offset is a sector count that is added to the sector number - before creating the IV. - - - This is the device that is going to be used as backend and contains the - encrypted data. You can specify it as a path like /dev/xxx or a device - number :. - - - Starting sector within the device where the encrypted data begins. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional paramaters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 3 allow_discards same_cpu_crypt submit_from_crypt_cpus - -allow_discards - Block discard requests (a.k.a. TRIM) are passed through the crypt device. - The default is to ignore discard requests. - - WARNING: Assess the specific security risks carefully before enabling this - option. For example, allowing discards on encrypted devices may lead to - the leak of information about the ciphertext device (filesystem type, - used space etc.) if the discarded blocks can be located easily on the - device later. - -same_cpu_crypt - Perform encryption using the same cpu that IO was submitted on. - The default is to use an unbound workqueue so that encryption work - is automatically balanced between available CPUs. - -submit_from_crypt_cpus - Disable offloading writes to a separate thread after encryption. - There are some situations where offloading write bios from the - encryption threads to a single thread degrades performance - significantly. The default is to offload write bios to the same - thread because it benefits CFQ to have writes submitted using the - same context. - -integrity:: - The device requires additional metadata per-sector stored - in per-bio integrity structure. This metadata must by provided - by underlying dm-integrity target. - - The can be "none" if metadata is used only for persistent IV. 
- - For Authenticated Encryption with Additional Data (AEAD) - the is "aead". An AEAD mode additionally calculates and verifies - integrity for the encrypted device. The additional space is then - used for storing authentication tag (and persistent IV if needed). - -sector_size: - Use as the encryption unit instead of 512 bytes sectors. - This option can be in range 512 - 4096 bytes and must be power of two. - Virtual device will announce this size as a minimal IO and logical sector. - -iv_large_sectors - IV generators will use sector number counted in units - instead of default 512 bytes sectors. - - For example, if is 4096 bytes, plain64 IV for the second - sector will be 8 (without flag) and 1 if iv_large_sectors is present. - The must be multiple of (in 512 bytes units) - if this flag is specified. - -Example scripts -=============== -LUKS (Linux Unified Key Setup) is now the preferred way to set up disk -encryption with dm-crypt using the 'cryptsetup' utility, see -https://gitlab.com/cryptsetup/cryptsetup - -:: - - #!/bin/sh - # Create a crypt device using dmsetup - dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" - -:: - - #!/bin/sh - # Create a crypt device using dmsetup when encryption key is stored in keyring service - dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" - -:: - - #!/bin/sh - # Create a crypt device using cryptsetup and LUKS header with default cipher - cryptsetup luksFormat $1 - cryptsetup luksOpen $1 crypt1 diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/device-mapper/dm-dust.txt deleted file mode 100644 index 954d402a1f6a..000000000000 --- a/Documentation/device-mapper/dm-dust.txt +++ /dev/null @@ -1,272 +0,0 @@ -dm-dust -======= - -This target emulates the behavior of bad sectors at arbitrary -locations, and the ability to enable the emulation of the failures -at an arbitrary 
time. - -This target behaves similarly to a linear target. At a given time, -the user can send a message to the target to start failing read -requests on specific blocks (to emulate the behavior of a hard disk -drive with bad sectors). - -When the failure behavior is enabled (i.e.: when the output of -"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks -in the "bad block list" will fail with EIO ("Input/output error"). - -Writes of blocks in the "bad block list will result in the following: - -1. Remove the block from the "bad block list". -2. Successfully complete the write. - -This emulates the "remapped sector" behavior of a drive with bad -sectors. - -Normally, a drive that is encountering bad sectors will most likely -encounter more bad sectors, at an unknown time or location. -With dm-dust, the user can use the "addbadblock" and "removebadblock" -messages to add arbitrary bad blocks at new locations, and the -"enable" and "disable" messages to modulate the state of whether the -configured "bad blocks" will be treated as bad, or bypassed. -This allows the pre-writing of test data and metadata prior to -simulating a "failure" event where bad sectors start to appear. - -Table parameters: ------------------ - - -Mandatory parameters: - : path to the block device. 
- : offset to data area from start of device_path - : block size in bytes - (minimum 512, maximum 1073741824, must be a power of 2) - -Usage instructions: -------------------- - -First, find the size (in 512-byte sectors) of the device to be used: - -$ sudo blockdev --getsz /dev/vdb1 -33552384 - -Create the dm-dust device: -(For a device with a block size of 512 bytes) -$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512' - -(For a device with a block size of 4096 bytes) -$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' - -Check the status of the read behavior ("bypass" indicates that all I/O -will be passed through to the underlying device): -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 bypass - -$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct -128+0 records in -128+0 records out - -$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct -128+0 records in -128+0 records out - -Adding and removing bad blocks: -------------------------------- - -At any time (i.e.: whether the device has the "bad block" emulation -enabled or disabled), bad blocks may be added or removed from the -device via the "addbadblock" and "removebadblock" messages: - -$ sudo dmsetup message dust1 0 addbadblock 60 -kernel: device-mapper: dust: badblock added at block 60 - -$ sudo dmsetup message dust1 0 addbadblock 67 -kernel: device-mapper: dust: badblock added at block 67 - -$ sudo dmsetup message dust1 0 addbadblock 72 -kernel: device-mapper: dust: badblock added at block 72 - -These bad blocks will be stored in the "bad block list". 
-While the device is in "bypass" mode, reads and writes will succeed: - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 bypass - -Enabling block read failures: ------------------------------ - -To enable the "fail read on bad block" behavior, send the "enable" message: - -$ sudo dmsetup message dust1 0 enable -kernel: device-mapper: dust: enabling read failures on bad sectors - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block - -With the device in "fail read on bad block" mode, attempting to read a -block will encounter an "Input/output error": - -$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct -dd: error reading '/dev/mapper/dust1': Input/output error -0+0 records in -0+0 records out -0 bytes copied, 0.00040651 s, 0.0 kB/s - -...and writing to the bad blocks will remove the blocks from the list, -therefore emulating the "remap" behavior of hard disk drives: - -$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct -128+0 records in -128+0 records out - -kernel: device-mapper: dust: block 60 removed from badblocklist by write -kernel: device-mapper: dust: block 67 removed from badblocklist by write -kernel: device-mapper: dust: block 72 removed from badblocklist by write -kernel: device-mapper: dust: block 87 removed from badblocklist by write - -Bad block add/remove error handling: ------------------------------------- - -Attempting to add a bad block that already exists in the list will -result in an "Invalid argument" error, as well as a helpful message: - -$ sudo dmsetup message dust1 0 addbadblock 88 -device-mapper: message ioctl on dust1 failed: Invalid argument -kernel: device-mapper: dust: block 88 already in badblocklist - -Attempting to remove a bad block that doesn't exist in the list will -result in an "Invalid argument" error, as well as a helpful message: - -$ sudo dmsetup message dust1 0 removebadblock 87 -device-mapper: message ioctl on dust1 failed: Invalid argument 
-kernel: device-mapper: dust: block 87 not found in badblocklist - -Counting the number of bad blocks in the bad block list: --------------------------------------------------------- - -To count the number of bad blocks configured in the device, run the -following message command: - -$ sudo dmsetup message dust1 0 countbadblocks - -A message will print with the number of bad blocks currently -configured on the device: - -kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found - -Querying for specific bad blocks: ---------------------------------- - -To find out if a specific block is in the bad block list, run the -following message command: - -$ sudo dmsetup message dust1 0 queryblock 72 - -The following message will print if the block is in the list: -device-mapper: dust: queryblock: block 72 found in badblocklist - -The following message will print if the block is in the list: -device-mapper: dust: queryblock: block 72 not found in badblocklist - -The "queryblock" message command will work in both the "enabled" -and "disabled" modes, allowing the verification of whether a block -will be treated as "bad" without having to issue I/O to the device, -or having to "enable" the bad block emulation. 
- -Clearing the bad block list: ----------------------------- - -To clear the bad block list (without needing to individually run -a "removebadblock" message command for every block), run the -following message command: - -$ sudo dmsetup message dust1 0 clearbadblocks - -After clearing the bad block list, the following message will appear: - -kernel: device-mapper: dust: clearbadblocks: badblocks cleared - -If there were no bad blocks to clear, the following message will -appear: - -kernel: device-mapper: dust: clearbadblocks: no badblocks found - -Message commands list: ----------------------- - -Below is a list of the messages that can be sent to a dust device: - -Operations on blocks (requires a argument): - -addbadblock -queryblock -removebadblock - -...where is a block number within range of the device - (corresponding to the block size of the device.) - -Single argument message commands: - -countbadblocks -clearbadblocks -disable -enable -quiet - -Device removal: ---------------- - -When finished, remove the device via the "dmsetup remove" command: - -$ sudo dmsetup remove dust1 - -Quiet mode: ------------ - -On test runs with many bad blocks, it may be desirable to avoid -excessive logging (from bad blocks added, removed, or "remapped"). -This can be done by enabling "quiet mode" via the following message: - -$ sudo dmsetup message dust1 0 quiet - -This will suppress log messages from add / remove / removed by write -operations. Log messages from "countbadblocks" or "queryblock" -message commands will still print in quiet mode. - -The status of quiet mode can be seen by running "dmsetup status": - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block quiet - -To disable quiet mode, send the "quiet" message again: - -$ sudo dmsetup message dust1 0 quiet - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block verbose - -(The presence of "verbose" indicates normal logging.) - -"Why not...?" 
-------------- - -scsi_debug has a "medium error" mode that can fail reads on one -specified sector (sector 0x1234, hardcoded in the source code), but -it uses RAM for the persistent storage, which drastically decreases -the potential device size. - -dm-flakey fails all I/O from all block locations at a specified time -frequency, and not a given point in time. - -When a bad sector occurs on a hard disk drive, reads to that sector -are failed by the device, usually resulting in an error code of EIO -("I/O error") or ENODATA ("No data available"). However, a write to -the sector may succeed, and result in the sector becoming readable -after the device controller no longer experiences errors reading the -sector (or after a reallocation of the sector). However, there may -be bad sectors that occur on the device in the future, in a different, -unpredictable location. - -This target seeks to provide a device that can exhibit the behavior -of a bad sector at a known sector location, at a known time, based -on a large storage device (at least tens of gigabytes, not occupying -system memory). diff --git a/Documentation/device-mapper/dm-flakey.rst b/Documentation/device-mapper/dm-flakey.rst deleted file mode 100644 index 86138735879d..000000000000 --- a/Documentation/device-mapper/dm-flakey.rst +++ /dev/null @@ -1,74 +0,0 @@ -========= -dm-flakey -========= - -This target is the same as the linear target except that it exhibits -unreliable behaviour periodically. It's been found useful in simulating -failing devices for testing purposes. - -Starting from the time the table is loaded, the device is available for - seconds, then exhibits unreliable behaviour for seconds, and then this cycle repeats. - -Also, consider using this in combination with the dm-delay target too, -which can delay reads and writes and/or send them to different -underlying devices. 
- -Table parameters ----------------- - -:: - - \ - [ []] - -Mandatory parameters: - - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - : - Number of seconds device is available. - : - Number of seconds device returns errors. - -Optional feature parameters: - - If no feature parameters are present, during the periods of - unreliability, all I/O returns errors. - - drop_writes: - All write I/O is silently ignored. - Read I/O is handled correctly. - - error_writes: - All write I/O is failed with an error signalled. - Read I/O is handled correctly. - - corrupt_bio_byte : - During , replace of the data of - each matching bio with . - - : - The offset of the byte to replace. - Counting starts at 1, to replace the first byte. - : - Either 'r' to corrupt reads or 'w' to corrupt writes. - 'w' is incompatible with drop_writes. - : - The value (from 0-255) to write. - : - Perform the replacement only if bio->bi_opf has all the - selected flags set. - -Examples: - -Replaces the 32nd byte of READ bios with the value 1:: - - corrupt_bio_byte 32 r 1 0 - -Replaces the 224th byte of REQ_META (=32) bios with the value 0:: - - corrupt_bio_byte 224 w 0 32 diff --git a/Documentation/device-mapper/dm-init.rst b/Documentation/device-mapper/dm-init.rst deleted file mode 100644 index e5242ff17e9b..000000000000 --- a/Documentation/device-mapper/dm-init.rst +++ /dev/null @@ -1,125 +0,0 @@ -================================ -Early creation of mapped devices -================================ - -It is possible to configure a device-mapper device to act as the root device for -your system in two ways. - -The first is to build an initial ramdisk which boots to a minimal userspace -which configures the device, then pivot_root(8) in to it. - -The second is to create one or more device-mappers using the module parameter -"dm-mod.create=" through the kernel boot command line argument. 
- -The format is specified as a string of data separated by commas and optionally -semi-colons, where: - - - a comma is used to separate fields like name, uuid, flags and table - (specifies one device) - - a semi-colon is used to separate devices. - -So the format will look like this:: - - dm-mod.create=,,,,
[,
+][;,,,,
[,
+]+] - -Where:: - - ::= The device name. - ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" - ::= The device minor number | "" - ::= "ro" | "rw" -
::= - ::= "verity" | "linear" | ... (see list below) - -The dm line should be equivalent to the one used by the dmsetup tool with the -`--concise` argument. - -Target types -============ - -Not all target types are available as there are serious risks in allowing -activation of certain DM targets without first using userspace tools to check -the validity of associated metadata. - -======================= ======================================================= -`cache` constrained, userspace should verify cache device -`crypt` allowed -`delay` allowed -`era` constrained, userspace should verify metadata device -`flakey` constrained, meant for test -`linear` allowed -`log-writes` constrained, userspace should verify metadata device -`mirror` constrained, userspace should verify main/mirror device -`raid` constrained, userspace should verify metadata device -`snapshot` constrained, userspace should verify src/dst device -`snapshot-origin` allowed -`snapshot-merge` constrained, userspace should verify src/dst device -`striped` allowed -`switch` constrained, userspace should verify dev path -`thin` constrained, requires dm target message from userspace -`thin-pool` constrained, requires dm target message from userspace -`verity` allowed -`writecache` constrained, userspace should verify cache device -`zero` constrained, not meant for rootfs -======================= ======================================================= - -If the target is not listed above, it is constrained by default (not tested). - -Examples -======== -An example of booting to a linear array made up of user-mode linux block -devices:: - - dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 - -This will boot to a rw dm-linear target of 8192 sectors split across two block -devices identified by their major:minor numbers. After boot, udev will rename -this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. 
- -An example of multiple device-mappers, with the dm-mod.create="..." contents -is shown here split on multiple lines for readability:: - - dm-linear,,1,rw, - 0 32768 linear 8:1 0, - 32768 1024000 linear 8:2 0; - dm-verity,,3,ro, - 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 - ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 - 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 - -Other examples (per target): - -"crypt":: - - dm-crypt,,8,ro, - 0 1048576 crypt aes-xts-plain64 - babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 - /dev/sda 0 1 allow_discards - -"delay":: - - dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 - -"linear":: - - dm-linear,,,rw, - 0 32768 linear /dev/sda1 0, - 32768 1024000 linear /dev/sda2 0, - 1056768 204800 linear /dev/sda3 0, - 1261568 512000 linear /dev/sda4 0 - -"snapshot-origin":: - - dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 - -"striped":: - - dm-striped,,4,ro,0 1638400 striped 4 4096 - /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 - -"verity":: - - dm-verity,,4,ro, - 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 - fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd - 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/device-mapper/dm-integrity.rst b/Documentation/device-mapper/dm-integrity.rst deleted file mode 100644 index a30aa91b5fbe..000000000000 --- a/Documentation/device-mapper/dm-integrity.rst +++ /dev/null @@ -1,259 +0,0 @@ -============ -dm-integrity -============ - -The dm-integrity target emulates a block device that has additional -per-sector tags that can be used for storing integrity information. - -A general problem with storing integrity tags with every sector is that -writing the sector and the integrity tag must be atomic - i.e. in case of -crash, either both sector and integrity tag or none of them is written. 
- -To guarantee write atomicity, the dm-integrity target uses journal, it -writes sector data and integrity tags into a journal, commits the journal -and then copies the data and integrity tags to their respective location. - -The dm-integrity target can be used with the dm-crypt target - in this -situation the dm-crypt target creates the integrity data and passes them -to the dm-integrity target via bio_integrity_payload attached to the bio. -In this mode, the dm-crypt and dm-integrity targets provide authenticated -disk encryption - if the attacker modifies the encrypted device, an I/O -error is returned instead of random data. - -The dm-integrity target can also be used as a standalone target, in this -mode it calculates and verifies the integrity tag internally. In this -mode, the dm-integrity target can be used to detect silent data -corruption on the disk or in the I/O path. - -There's an alternate mode of operation where dm-integrity uses bitmap -instead of a journal. If a bit in the bitmap is 1, the corresponding -region's data and integrity tags are not synchronized - if the machine -crashes, the unsynchronized regions will be recalculated. The bitmap mode -is faster than the journal mode, because we don't have to write the data -twice, but it is also less reliable, because if data corruption happens -when the machine crashes, it may not be detected. - -When loading the target for the first time, the kernel driver will format -the device. But it will only format the device if the superblock contains -zeroes. If the superblock is neither valid nor zeroed, the dm-integrity -target can't be loaded. - -To use the target for the first time: - -1. overwrite the superblock with zeroes -2. load the dm-integrity target with one-sector size, the kernel driver - will format the device -3. unload the dm-integrity target -4. read the "provided_data_sectors" value from the superblock -5. load the dm-integrity target with the the target size - "provided_data_sectors" -6. 
if you want to use dm-integrity with dm-crypt, load the dm-crypt target - with the size "provided_data_sectors" - - -Target arguments: - -1. the underlying block device - -2. the number of reserved sector at the beginning of the device - the - dm-integrity won't read of write these sectors - -3. the size of the integrity tag (if "-" is used, the size is taken from - the internal-hash algorithm) - -4. mode: - - D - direct writes (without journal) - in this mode, journaling is - not used and data sectors and integrity tags are written - separately. In case of crash, it is possible that the data - and integrity tag doesn't match. - J - journaled writes - data and integrity tags are written to the - journal and atomicity is guaranteed. In case of crash, - either both data and tag or none of them are written. The - journaled mode degrades write throughput twice because the - data have to be written twice. - B - bitmap mode - data and metadata are written without any - synchronization, the driver maintains a bitmap of dirty - regions where data and metadata don't match. This mode can - only be used with internal hash. - R - recovery mode - in this mode, journal is not replayed, - checksums are not checked and writes to the device are not - allowed. This mode is useful for data recovery if the - device cannot be activated in any of the other standard - modes. - -5. the number of additional arguments - -Additional arguments: - -journal_sectors:number - The size of journal, this argument is used only if formatting the - device. If the device is already formatted, the value from the - superblock is used. - -interleave_sectors:number - The number of interleaved sectors. This values is rounded down to - a power of two. If the device is already formatted, the value from - the superblock is used. - -meta_device:device - Don't interleave the data and metadata on on device. Use a - separate device for metadata. - -buffer_sectors:number - The number of sectors in one buffer. 
The value is rounded down to - a power of two. - - The tag area is accessed using buffers, the buffer size is - configurable. The large buffer size means that the I/O size will - be larger, but there could be less I/Os issued. - -journal_watermark:number - The journal watermark in percents. When the size of the journal - exceeds this watermark, the thread that flushes the journal will - be started. - -commit_time:number - Commit time in milliseconds. When this time passes, the journal is - written. The journal is also written immediatelly if the FLUSH - request is received. - -internal_hash:algorithm(:key) (the key is optional) - Use internal hash or crc. - When this argument is used, the dm-integrity target won't accept - integrity tags from the upper target, but it will automatically - generate and verify the integrity tags. - - You can use a crc algorithm (such as crc32), then integrity target - will protect the data against accidental corruption. - You can also use a hmac algorithm (for example - "hmac(sha256):0123456789abcdef"), in this mode it will provide - cryptographic authentication of the data without encryption. - - When this argument is not used, the integrity tags are accepted - from an upper layer target, such as dm-crypt. The upper layer - target should check the validity of the integrity tags. - -recalculate - Recalculate the integrity tags automatically. It is only valid - when using internal hash. - -journal_crypt:algorithm(:key) (the key is optional) - Encrypt the journal using given algorithm to make sure that the - attacker can't read the journal. You can use a block cipher here - (such as "cbc(aes)") or a stream cipher (for example "chacha20", - "salsa20", "ctr(aes)" or "ecb(arc4)"). - - The journal contains history of last writes to the block device, - an attacker reading the journal could see the last sector nubmers - that were written. From the sector numbers, the attacker can infer - the size of files that were written. 
To protect against this - situation, you can encrypt the journal. - -journal_mac:algorithm(:key) (the key is optional) - Protect sector numbers in the journal from accidental or malicious - modification. To protect against accidental modification, use a - crc algorithm, to protect against malicious modification, use a - hmac algorithm with a key. - - This option is not needed when using internal-hash because in this - mode, the integrity of journal entries is checked when replaying - the journal. Thus, modified sector number would be detected at - this stage. - -block_size:number - The size of a data block in bytes. The larger the block size the - less overhead there is for per-block integrity metadata. - Supported values are 512, 1024, 2048 and 4096 bytes. If not - specified the default block size is 512 bytes. - -sectors_per_bit:number - In the bitmap mode, this parameter specifies the number of - 512-byte sectors that corresponds to one bitmap bit. - -bitmap_flush_interval:number - The bitmap flush interval in milliseconds. The metadata buffers - are synchronized when this interval expires. - - -The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can -be changed when reloading the target (load an inactive table and swap the -tables with suspend and resume). The other arguments should not be changed -when reloading the target because the layout of disk data depend on them -and the reloaded target would be non-functional. - - -The layout of the formatted block device: - -* reserved sectors - (they are not used by this target, they can be used for - storing LUKS metadata or for other purpose), the size of the reserved - area is specified in the target arguments - -* superblock (4kiB) - * magic string - identifies that the device was formatted - * version - * log2(interleave sectors) - * integrity tag size - * the number of journal sections - * provided data sectors - the number of sectors that this target - provides (i.e. 
the size of the device minus the size of all - metadata and padding). The user of this target should not send - bios that access data beyond the "provided data sectors" limit. - * flags - SB_FLAG_HAVE_JOURNAL_MAC - - a flag is set if journal_mac is used - SB_FLAG_RECALCULATING - - recalculating is in progress - SB_FLAG_DIRTY_BITMAP - - journal area contains the bitmap of dirty - blocks - * log2(sectors per block) - * a position where recalculating finished -* journal - The journal is divided into sections, each section contains: - - * metadata area (4kiB), it contains journal entries - - - every journal entry contains: - - * logical sector (specifies where the data and tag should - be written) - * last 8 bytes of data - * integrity tag (the size is specified in the superblock) - - - every metadata sector ends with - - * mac (8-bytes), all the macs in 8 metadata sectors form a - 64-byte value. It is used to store hmac of sector - numbers in the journal section, to protect against a - possibility that the attacker tampers with sector - numbers in the journal. - * commit id - - * data area (the size is variable; it depends on how many journal - entries fit into the metadata area) - - - every sector in the data area contains: - - * data (504 bytes of data, the last 8 bytes are stored in - the journal entry) - * commit id - - To test if the whole journal section was written correctly, every - 512-byte sector of the journal ends with 8-byte commit id. If the - commit id matches on all sectors in a journal section, then it is - assumed that the section was written correctly. If the commit id - doesn't match, the section was written partially and it should not - be replayed. - -* one or more runs of interleaved tags and data. - Each run contains: - - * tag area - it contains integrity tags. There is one tag for each - sector in the data area - * data area - it contains data sectors. The number of data sectors - in one run must be a power of two. 
log2 of this value is stored - in the superblock. diff --git a/Documentation/device-mapper/dm-io.rst b/Documentation/device-mapper/dm-io.rst deleted file mode 100644 index d2492917a1f5..000000000000 --- a/Documentation/device-mapper/dm-io.rst +++ /dev/null @@ -1,75 +0,0 @@ -===== -dm-io -===== - -Dm-io provides synchronous and asynchronous I/O services. There are three -types of I/O services available, and each type has a sync and an async -version. - -The user must set up an io_region structure to describe the desired location -of the I/O. Each io_region indicates a block-device along with the starting -sector and size of the region:: - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -Dm-io can read from one io_region or write to one or more io_regions. Writes -to multiple regions are specified by an array of io_region structures. - -The first I/O service type takes a list of memory pages as the data buffer for -the I/O, along with an offset into the first page:: - - struct page_list { - struct page_list *next; - struct page *page; - }; - - int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - unsigned long *error_bits); - int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - io_notify_fn fn, void *context); - -The second I/O service type takes an array of bio vectors as the data buffer -for the I/O. 
This service can be handy if the caller has a pre-assembled bio, -but wants to direct different portions of the bio to different devices:: - - int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - unsigned long *error_bits); - int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - io_notify_fn fn, void *context); - -The third I/O service type takes a pointer to a vmalloc'd memory buffer as the -data buffer for the I/O. This service can be handy if the caller needs to do -I/O to a large region but doesn't want to allocate a large number of individual -memory pages:: - - int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, unsigned long *error_bits); - int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, io_notify_fn fn, void *context); - -Callers of the asynchronous I/O services must include the name of a completion -callback routine and a pointer to some context data for the I/O:: - - typedef void (*io_notify_fn)(unsigned long error, void *context); - -The "error" parameter in this callback, as well as the `*error` parameter in -all of the synchronous versions, is a bitset (instead of a simple error value). -In the case of an write-I/O to multiple regions, this bitset allows dm-io to -indicate success or failure on each individual region. - -Before using any of the dm-io services, the user should call dm_io_get() -and specify the number of pages they expect to perform I/O on concurrently. -Dm-io will attempt to resize its mempool to make sure enough pages are -always available in order to avoid unnecessary waiting while performing I/O. - -When the user is finished using the dm-io services, they should call -dm_io_put() and specify the same number of pages that were given on the -dm_io_get() call. 
diff --git a/Documentation/device-mapper/dm-log.rst b/Documentation/device-mapper/dm-log.rst deleted file mode 100644 index ba4fce39bc27..000000000000 --- a/Documentation/device-mapper/dm-log.rst +++ /dev/null @@ -1,57 +0,0 @@ -===================== -Device-Mapper Logging -===================== -The device-mapper logging code is used by some of the device-mapper -RAID targets to track regions of the disk that are not consistent. -A region (or portion of the address space) of the disk may be -inconsistent because a RAID stripe is currently being operated on or -a machine died while the region was being altered. In the case of -mirrors, a region would be considered dirty/inconsistent while you -are writing to it because the writes need to be replicated for all -the legs of the mirror and may not reach the legs at the same time. -Once all writes are complete, the region is considered clean again. - -There is a generic logging interface that the device-mapper RAID -implementations use to perform logging operations (see -dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different -logging implementations are available and provide different -capabilities. The list includes: - -============== ============================================================== -Type Files -============== ============================================================== -disk drivers/md/dm-log.c -core drivers/md/dm-log.c -userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h -============== ============================================================== - -The "disk" log type -------------------- -This log implementation commits the log state to disk. This way, the -logging state survives reboots/crashes. - -The "core" log type -------------------- -This log implementation keeps the log state in memory. The log state -will not survive a reboot or crash, but there may be a small boost in -performance. 
This method can also be used if no storage device is -available for storing log state. - -The "userspace" log type ------------------------- -This log type simply provides a way to export the log API to userspace, -so log implementations can be done there. This is done by forwarding most -logging requests to userspace, where a daemon receives and processes the -request. - -The structure used for communication between kernel and userspace are -located in include/linux/dm-log-userspace.h. Due to the frequency, -diversity, and 2-way communication nature of the exchanges between -kernel and userspace, 'connector' is used as the interface for -communication. - -There are currently two userspace log implementations that leverage this -framework - "clustered-disk" and "clustered-core". These implementations -provide a cluster-coherent log for shared-storage. Device-mapper mirroring -can be used in a shared-storage environment when the cluster log implementations -are employed. diff --git a/Documentation/device-mapper/dm-queue-length.rst b/Documentation/device-mapper/dm-queue-length.rst deleted file mode 100644 index d8e381c1cb02..000000000000 --- a/Documentation/device-mapper/dm-queue-length.rst +++ /dev/null @@ -1,48 +0,0 @@ -=============== -dm-queue-length -=============== - -dm-queue-length is a path selector module for device-mapper targets, -which selects a path with the least number of in-flight I/Os. -The path selector name is 'queue-length'. - -Table parameters for each path: [] - -:: - - : The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - -Status for each path: - -:: - - : 'A' if the path is active, 'F' if the path is failed. - : The number of path failures. - : The number of in-flight I/Os on the path. 
- - -Algorithm -========= - -dm-queue-length increments/decrements 'in-flight' when an I/O is -dispatched/completed respectively. -dm-queue-length selects a path with the minimum 'in-flight'. - - -Examples -======== -In the case where 2 paths (sda and sdb) are used with repeat_count == 128: - -:: - - # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" | \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/device-mapper/dm-raid.rst b/Documentation/device-mapper/dm-raid.rst deleted file mode 100644 index 2fe255b130fb..000000000000 --- a/Documentation/device-mapper/dm-raid.rst +++ /dev/null @@ -1,419 +0,0 @@ -======= -dm-raid -======= - -The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. -It allows the MD RAID drivers to be accessed using a device-mapper -interface. - - -Mapping Table Interface ------------------------ -The target is named "raid" and it accepts the following parameters:: - - <raid_type> <#raid_params> <raid_params> \ - <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>
] - -: - - ============= =============================================================== - raid0 RAID0 striping (no resilience) - raid1 RAID1 mirroring - raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk supporting takeover - Same as raid4 - - - Transitory layout - raid5_la RAID5 left asymmetric - - - rotating parity 0 with data continuation - raid5_ra RAID5 right asymmetric - - - rotating parity N with data continuation - raid5_ls RAID5 left symmetric - - - rotating parity 0 with data restart - raid5_rs RAID5 right symmetric - - - rotating parity N with data restart - raid6_zr RAID6 zero restart - - - rotating parity zero (left-to-right) with data restart - raid6_nr RAID6 N restart - - - rotating parity N (right-to-left) with data restart - raid6_nc RAID6 N continue - - - rotating parity N (right-to-left) with data continuation - raid6_n_6 RAID6 with dedicate parity disks - - - parity and Q-syndrome on the last 2 disks; - layout for takeover from/to raid4/raid5_n - raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk - - - layout for takeover from raid5_la from/to raid6 - raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk - - - layout for takeover from raid5_ra from/to raid6 - raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk - - - layout for takeover from raid5_ls from/to raid6 - raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk - - - layout for takeover from raid5_rs from/to raid6 - raid10 Various RAID10 inspired algorithms chosen by additional params - (see raid10_format and raid10_copies below) - - - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - - RAID1E: Integrated Adjacent Stripe Mirroring - - RAID1E: Integrated Offset Stripe Mirroring - - and other similar RAID10 variants - ============= =============================================================== - - Reference: Chapter 4 of - http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf - 
-<#raid_params>: The number of parameters that follow. - - consists of - - Mandatory parameters: - : - Chunk size in sectors. This parameter is often known as - "stripe size". It is the only mandatory parameter and - is placed first. - - followed by optional parameters (in any order): - [sync|nosync] - Force or prevent RAID initialization. - - [rebuild ] - Rebuild drive number 'idx' (first drive is 0). - - [daemon_sleep ] - Interval between runs of the bitmap daemon that - clear bits. A longer interval means less bitmap I/O but - resyncing after a failure is likely to take longer. - - [min_recovery_rate ] - Throttle RAID initialization - [max_recovery_rate ] - Throttle RAID initialization - [write_mostly ] - Mark drive index 'idx' write-mostly. - [max_write_behind ] - See '--write-behind=' (man mdadm) - [stripe_cache ] - Stripe cache size (RAID 4/5/6 only) - [region_size ] - The region_size multiplied by the number of regions is the - logical size of the array. The bitmap records the device - synchronisation state for each region. - - [raid10_copies <# copies>], [raid10_format ] - These two options are used to alter the default layout of - a RAID10 configuration. The number of copies is can be - specified, but the default is 2. There are also three - variations to how the copies are laid down - the default - is "near". Near copies are what most people think of with - respect to mirroring. If these options are left unspecified, - or 'raid10_copies 2' and/or 'raid10_format near' are given, - then the layouts for 2, 3 and 4 devices are: - - ======== ========== ============== - 2 drives 3 drives 4 drives - ======== ========== ============== - A1 A1 A1 A1 A2 A1 A1 A2 A2 - A2 A2 A2 A3 A3 A3 A3 A4 A4 - A3 A3 A4 A4 A5 A5 A5 A6 A6 - A4 A4 A5 A6 A6 A7 A7 A8 A8 - .. .. .. .. .. .. .. .. .. - ======== ========== ============== - - The 2-device layout is equivalent 2-way RAID1. The 4-device - layout is what a traditional RAID10 would look like. 
The - 3-device layout is what might be called a 'RAID1E - Integrated - Adjacent Stripe Mirroring'. - - If 'raid10_copies 2' and 'raid10_format far', then the layouts - for 2, 3 and 4 devices are: - - ======== ============ =================== - 2 drives 3 drives 4 drives - ======== ============ =================== - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - .. .. .. .. .. .. .. .. .. - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - ======== ============ =================== - - If 'raid10_copies 2' and 'raid10_format offset', then the - layouts for 2, 3 and 4 devices are: - - ======== ========== ================ - 2 drives 3 drives 4 drives - ======== ========== ================ - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - ======== ========== ================ - - Here we see layouts closely akin to 'RAID1E - Integrated - Offset Stripe Mirroring'. - - [delta_disks ] - The delta_disks option value (-251 < N < +251) triggers - device removal (negative value) or device addition (positive - value) to any reshape supporting raid levels 4/5/6 and 10. - RAID levels 4/5/6 allow for addition of devices (metadata - and data device tuple), raid10_near and raid10_offset only - allow for device addition. raid10_far does not support any - reshaping at all. - A minimum of devices have to be kept to enforce resilience, - which is 3 devices for raid4/5 and 4 devices for raid6. - - [data_offset ] - This option value defines the offset into each data device - where the data starts. This is used to provide out-of-place - reshaping space to avoid writing over data while - changing the layout of stripes, hence an interruption/crash - may happen at any time without the risk of losing data. - E.g. 
when adding devices to an existing raid set during - forward reshaping, the out-of-place space will be allocated - at the beginning of each raid device. The kernel raid4/5/6/10 - MD personalities supporting such device addition will read the data from - the existing first stripes (those with smaller number of stripes) - starting at data_offset to fill up a new stripe with the larger - number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) - and write that new stripe to offset 0. Same will be applied to all - N-1 other new stripes. This out-of-place scheme is used to change - the RAID type (i.e. the allocation algorithm) as well, e.g. - changing from raid5_ls to raid5_n. - - [journal_dev ] - This option adds a journal device to raid4/5/6 raid sets and - uses it to close the 'write hole' caused by the non-atomic updates - to the component devices which can cause data loss during recovery. - The journal device is used as writethrough thus causing writes to - be throttled versus non-journaled raid4/5/6 sets. - Takeover/reshape is not possible with a raid4/5/6 journal device; - it has to be deconfigured before requesting these. - - [journal_mode ] - This option sets the caching mode on journaled raid4/5/6 raid sets - (see 'journal_dev ' above) to 'writethrough' or 'writeback'. - If 'writeback' is selected the journal device has to be resilient - and must not suffer from the 'write hole' problem itself (e.g. use - raid1 or raid10) to avoid a single point of failure. - -<#raid_devs>: The number of devices composing the array. - Each device consists of two entries. The first is the device - containing the metadata (if any); the second is the one containing the - data. A Maximum of 64 metadata/data device entries are supported - up to target version 1.8.0. - 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. 
- - If a drive has failed or is missing at creation time, a '-' can be - given for both the metadata and data drives for a given position. - - -Example Tables --------------- - -:: - - # RAID4 - 4 data drives, 1 parity (no metadata devices) - # No metadata devices specified to hold superblock/bitmap info - # Chunk size of 1MiB - # (Lines separated for easy reading) - - 0 1960893648 raid \ - raid4 1 2048 \ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 - - # RAID4 - 4 data drives, 1 parity (with metadata devices) - # Chunk size of 1MiB, force RAID initialization, - # min recovery rate at 20 kiB/sec/disk - - 0 1960893648 raid \ - raid4 4 2048 sync min_recovery_rate 20 \ - 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 - - -Status Output -------------- -'dmsetup table' displays the table used to construct the mapping. -The optional parameters are always printed in the order listed -above with "sync" or "nosync" always output ahead of the other -arguments, regardless of the order used when originally loading the table. -Arguments that can be repeated are ordered by value. - - -'dmsetup status' yields information on the state and health of the array. -The output is as follows (normally a single line, but expanded here for -clarity):: - - 1: raid \ - 2: <#devices> \ - 3: - -Line 1 is the standard output produced by device-mapper. - -Line 2 & 3 are produced by the raid target and are best explained by example:: - - 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 - -Here we can see the RAID type is raid4, there are 5 devices - all of -which are 'A'live, and the array is 2/490221568 complete with its initial -recovery. Here is a fuller description of the individual fields: - - =============== ========================================================= - Same as the used to create the array. - One char for each device, indicating: - - - 'A' = alive and in-sync - - 'a' = alive but not in-sync - - 'D' = dead/failed. 
- The ratio indicating how much of the array has undergone - the process described by 'sync_action'. If the - 'sync_action' is "check" or "repair", then the process - of "resync" or "recover" can be considered complete. - One of the following possible states: - - idle - - No synchronization action is being performed. - frozen - - The current action has been halted. - resync - - Array is undergoing its initial synchronization - or is resynchronizing after an unclean shutdown - (possibly aided by a bitmap). - recover - - A device in the array is being rebuilt or - replaced. - check - - A user-initiated full check of the array is - being performed. All blocks are read and - checked for consistency. The number of - discrepancies found are recorded in - . No changes are made to the - array by this action. - repair - - The same as "check", but discrepancies are - corrected. - reshape - - The array is undergoing a reshape. - The number of discrepancies found between mirror copies - in RAID1/10 or wrong parity values found in RAID4/5/6. - This value is valid only after a "check" of the array - is performed. A healthy array has a 'mismatch_cnt' of 0. - The current data offset to the start of the user data on - each component device of a raid set (see the respective - raid parameter to support out-of-place reshaping). - - 'A' - active write-through journal device. - - 'a' - active write-back journal device. - - 'D' - dead journal device. - - '-' - no journal device. - =============== ========================================================= - - -Message Interface ------------------ -The dm-raid target will accept certain actions through the 'message' interface. -('man dmsetup' for more information on the message interface.) These actions -include: - - ========= ================================================ - "idle" Halt the current sync action. - "frozen" Freeze the current sync action. - "resync" Initiate/continue a resync. 
- "recover" Initiate/continue a recover process. - "check" Initiate a check (i.e. a "scrub") of the array. - "repair" Initiate a repair of the array. - ========= ================================================ - - -Discard Support ---------------- -The implementation of discard support among hardware vendors varies. -When a block is discarded, some storage devices will return zeroes when -the block is read. These devices set the 'discard_zeroes_data' -attribute. Other devices will return random data. Confusingly, some -devices that advertise 'discard_zeroes_data' will not reliably return -zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks -from a number of devices to calculate parity blocks and (for performance -reasons) relies on 'discard_zeroes_data' being reliable, it is important -that the devices be consistent. Blocks may be discarded in the middle -of a RAID 4/5/6 stripe and if subsequent read results are not -consistent, the parity blocks may be calculated differently at any time; -making the parity blocks useless for redundancy. It is important to -understand how your hardware behaves with discards if you are going to -enable discards with RAID 4/5/6. - -Since the behavior of storage devices is unreliable in this respect, -even when reporting 'discard_zeroes_data', by default RAID 4/5/6 -discard support is disabled -- this ensures data integrity at the -expense of losing some performance. - -Storage devices that properly support 'discard_zeroes_data' are -increasingly whitelisted in the kernel and can thus be trusted. - -For trusted devices, the following dm-raid module parameter can be set -to safely enable discard support for RAID 4/5/6: - - 'devices_handle_discards_safely' - - -Version History ---------------- - -:: - - 1.0.0 Initial version. Support for RAID 4/5/6 - 1.1.0 Added support for RAID 1 - 1.2.0 Handle creation of arrays that contain failed devices. 
- 1.3.0 Added support for RAID 10 - 1.3.1 Allow device replacement/rebuild for RAID 10 - 1.3.2 Fix/improve redundancy checking for RAID10 - 1.4.0 Non-functional change. Removes arg from mapping function. - 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). - 1.4.2 Add RAID10 "far" and "offset" algorithm support. - 1.5.0 Add message interface to allow manipulation of the sync_action. - New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. - 1.5.1 Add ability to restore transiently failed devices on resume. - 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". - 1.6.0 Add discard support (and devices_handle_discard_safely module param). - 1.7.0 Add support for MD RAID0 mappings. - 1.8.0 Explicitly check for compatible flags in the superblock metadata - and refuse to start the raid set if any are set by a newer - target version, thus avoiding data corruption on a raid set - with a reshape in progress. - 1.9.0 Add support for RAID level takeover/reshape/region size - and set size reduction. - 1.9.1 Fix activation of existing RAID 4/10 mapped devices - 1.9.2 Don't emit '- -' on the status table line in case the constructor - fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and - 'D' on the status line. If '- -' is passed into the constructor, emit - '- -' on the table line and '-' as the status line health character. - 1.10.0 Add support for raid4/5/6 journal device - 1.10.1 Fix data corruption on reshape request - 1.11.0 Fix table line argument order - (wrong raid10_copies/raid10_format sequence) - 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option - 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available - 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') - 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size and - state races.
- 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen - 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape - deadlock/potential data corruption. Update superblock when - specific devices are requested via rebuild. Fix RAID leg - rebuild errors. diff --git a/Documentation/device-mapper/dm-service-time.rst b/Documentation/device-mapper/dm-service-time.rst deleted file mode 100644 index facf277fc13c..000000000000 --- a/Documentation/device-mapper/dm-service-time.rst +++ /dev/null @@ -1,101 +0,0 @@ -=============== -dm-service-time -=============== - -dm-service-time is a path selector module for device-mapper targets, -which selects a path with the shortest estimated service time for -the incoming I/O. - -The service time for each path is estimated by dividing the total size -of in-flight I/Os on a path with the performance value of the path. -The performance value is a relative throughput value among all paths -in a path-group, and it can be specified as a table argument. - -The path selector name is 'service-time'. - -Table parameters for each path: - - [ []] - : - The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - : - The relative throughput value of the path - among all paths in the path-group. - The valid range is 0-100. - If not given, minimum value '1' is used. - If '0' is given, the path isn't selected while - other paths having a positive value are available. - -Status for each path: - - - : - 'A' if the path is active, 'F' if the path is failed. - : - The number of path failures. - : - The size of in-flight I/Os on the path. - : - The relative throughput value of the path - among all paths in the path-group. - - -Algorithm -========= - -dm-service-time adds the I/O size to 'in-flight-size' when the I/O is -dispatched and subtracts when completed. 
-Basically, dm-service-time selects a path having minimum service time -which is calculated by:: - - ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' - -However, some optimizations below are used to reduce the calculation -as much as possible. - - 1. If the paths have the same 'relative_throughput', skip - the division and just compare the 'in-flight-size'. - - 2. If the paths have the same 'in-flight-size', skip the division - and just compare the 'relative_throughput'. - - 3. If some paths have non-zero 'relative_throughput' and others - have zero 'relative_throughput', ignore those paths with zero - 'relative_throughput'. - -If such optimizations can't be applied, calculate service time, and -compare service time. -If calculated service time is equal, the path having maximum -'relative_throughput' may be better. So compare 'relative_throughput' -then. - - -Examples -======== -In case that 2 paths (sda and sdb) are used with repeat_count == 128 -and sda has an average throughput 1GB/s and sdb has 4GB/s, -'relative_throughput' value may be '1' for sda and '4' for sdb:: - - # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" | \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 - - -Or '2' for sda and '8' for sdb would also be valid:: - - # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" | \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/device-mapper/dm-uevent.rst b/Documentation/device-mapper/dm-uevent.rst deleted file mode 100644 index 4a8ee8d069c9..000000000000 --- a/Documentation/device-mapper/dm-uevent.rst +++ /dev/null @@ -1,110 +0,0 @@ -====================
-device-mapper uevent -==================== - -The device-mapper uevent code adds the capability to device-mapper to create -and send kobject uevents (uevents). Previously device-mapper events were only -available through the ioctl interface. The advantage of the uevents interface -is the event contains environment attributes providing increased context for -the event avoiding the need to query the state of the device-mapper device after -the event is received. - -There are two functions currently for device-mapper events. The first function -listed creates the event and the second function sends the event(s):: - - void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, - const char *path, unsigned nr_valid_paths) - - void dm_send_uevents(struct list_head *events, struct kobject *kobj) - - -The variables added to the uevent environment are: - -Variable Name: DM_TARGET ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: -:Value: Name of device-mapper target that generated the event. - -Variable Name: DM_ACTION ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: -:Value: Device-mapper specific action that caused the uevent action. - PATH_FAILED - A path has failed; - PATH_REINSTATED - A path has been reinstated. - -Variable Name: DM_SEQNUM ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: unsigned integer -:Description: A sequence number for this specific device-mapper device. -:Value: Valid unsigned integer range. - -Variable Name: DM_PATH ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: Major and minor number of the path device pertaining to this - event. -:Value: Path name in the form of "Major:Minor" - -Variable Name: DM_NR_VALID_PATHS --------------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: unsigned integer -:Description: -:Value: Valid unsigned integer range. 
- -Variable Name: DM_NAME ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: Name of the device-mapper device. -:Value: Name - -Variable Name: DM_UUID ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: UUID of the device-mapper device. -:Value: UUID. (Empty string if there isn't one.) - -An example of the uevents generated as captured by udevmonitor is shown -below - -1.) Path failure:: - - UEVENT[1192521009.711215] change@/block/dm-3 - ACTION=change - DEVPATH=/block/dm-3 - SUBSYSTEM=block - DM_TARGET=multipath - DM_ACTION=PATH_FAILED - DM_SEQNUM=1 - DM_PATH=8:32 - DM_NR_VALID_PATHS=0 - DM_NAME=mpath2 - DM_UUID=mpath-35333333000002328 - MINOR=3 - MAJOR=253 - SEQNUM=1130 - -2.) Path reinstate:: - - UEVENT[1192521132.989927] change@/block/dm-3 - ACTION=change - DEVPATH=/block/dm-3 - SUBSYSTEM=block - DM_TARGET=multipath - DM_ACTION=PATH_REINSTATED - DM_SEQNUM=2 - DM_PATH=8:32 - DM_NR_VALID_PATHS=1 - DM_NAME=mpath2 - DM_UUID=mpath-35333333000002328 - MINOR=3 - MAJOR=253 - SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-zoned.rst b/Documentation/device-mapper/dm-zoned.rst deleted file mode 100644 index 07f56ebc1730..000000000000 --- a/Documentation/device-mapper/dm-zoned.rst +++ /dev/null @@ -1,146 +0,0 @@ -======== -dm-zoned -======== - -The dm-zoned device mapper target exposes a zoned block device (ZBC and -ZAC compliant devices) as a regular block device without any write -pattern constraints. In effect, it implements a drive-managed zoned -block device which hides from the user (a file system or an application -doing raw block device accesses) the sequential write constraints of -host-managed zoned block devices and can mitigate the potential -device-side performance degradation due to excessive random writes on -host-aware zoned block devices. 
- -For a more detailed description of the zoned block device models and -their constraints see (for SCSI devices): - -http://www.t10.org/drafts.htm#ZBC_Family - -and (for ATA devices): - -http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf - -The dm-zoned implementation is simple and minimizes system overhead (CPU -and memory usage as well as storage capacity loss). For a 10TB -host-managed disk with 256 MB zones, dm-zoned memory usage per disk -instance is at most 4.5 MB and as few as 5 zones will be used -internally for storing metadata and performing reclaim operations. - -dm-zoned target devices are formatted and checked using the dmzadm -utility available at: - -https://github.com/hgst/dm-zoned-tools - -Algorithm -========= - -dm-zoned implements an on-disk buffering scheme to handle non-sequential -write accesses to the sequential zones of a zoned block device. -Conventional zones are used for caching as well as for storing internal -metadata. - -The zones of the device are separated into 2 types: - -1) Metadata zones: these are conventional zones used to store metadata. -Metadata zones are not reported as useable capacity to the user. - -2) Data zones: all remaining zones, the vast majority of which will be -sequential zones used exclusively to store user data. The conventional -zones of the device may also be used for buffering user random writes. -Data in these zones may be directly mapped to the conventional zone, but -later moved to a sequential zone so that the conventional zone can be -reused for buffering incoming random writes. - -dm-zoned exposes a logical device with a sector size of 4096 bytes, -irrespective of the physical sector size of the backend zoned block -device being used. This allows reducing the amount of metadata needed to -manage valid blocks (blocks written).
- -The on-disk metadata format is as follows: - -1) The first block of the first conventional zone found contains the -super block which describes the on disk amount and position of metadata -blocks. - -2) Following the super block, a set of blocks is used to describe the -mapping of the logical device blocks. The mapping is done per chunk of -blocks, with the chunk size equal to the zoned block device zone size. The -mapping table is indexed by chunk number and each mapping entry -indicates the zone number of the device storing the chunk of data. Each -mapping entry may also indicate the zone number of a conventional -zone used to buffer random modifications to the data zone. - -3) A set of blocks used to store bitmaps indicating the validity of -blocks in the data zones follows the mapping table. A valid block is -defined as a block that was written and not discarded. For a buffered -data chunk, a block is always valid only in the data zone mapping the -chunk or in the buffer zone of the chunk. - -For a logical chunk mapped to a conventional zone, all write operations -are processed by directly writing to the zone. If the mapping zone is a -sequential zone, the write operation is processed directly only if the -write offset within the logical chunk is equal to the write pointer -offset within the sequential data zone (i.e. the write operation is -aligned on the zone write pointer). Otherwise, write operations are -processed indirectly using a buffer zone. In that case, an unused -conventional zone is allocated and assigned to the chunk being -accessed. Writing a block to the buffer zone of a chunk will -automatically invalidate the same block in the sequential zone mapping -the chunk. If all blocks of the sequential zone become invalid, the zone -is freed and the chunk buffer zone becomes the primary zone mapping the -chunk, resulting in native random write performance similar to a regular -block device.
- -Read operations are processed according to the block validity -information provided by the bitmaps. Valid blocks are read either from -the sequential zone mapping a chunk, or if the chunk is buffered, from -the buffer zone assigned. If the accessed chunk has no mapping, or the -accessed blocks are invalid, the read buffer is zeroed and the read -operation terminated. - -After some time, the limited number of conventional zones available may -be exhausted (all used to map chunks or buffer sequential zones) and -unaligned writes to unbuffered chunks become impossible. To avoid this -situation, a reclaim process regularly scans used conventional zones and -tries to reclaim the least recently used zones by copying the valid -blocks of the buffer zone to a free sequential zone. Once the copy -completes, the chunk mapping is updated to point to the sequential zone -and the buffer zone freed for reuse. - -Metadata Protection -=================== - -To protect metadata against corruption in case of sudden power loss or -system crash, 2 sets of metadata zones are used. One set, the primary -set, is used as the main metadata region, while the secondary set is -used as a staging area. Modified metadata is first written to the -secondary set and validated by updating the super block in the secondary -set; a generation counter is used to indicate that this set contains the -newest metadata. Once this operation completes, in-place metadata -block updates can be done in the primary metadata set. This ensures that -one of the sets is always consistent (all modifications committed or none -at all). Flush operations are used as a commit point. Upon reception of -a flush request, metadata modification activity is temporarily blocked -(for both incoming BIO processing and reclaim process) and all dirty -metadata blocks are staged and updated. Normal operation is then -resumed. Flushing metadata thus only temporarily delays write and -discard requests.
Read requests can be processed concurrently while -a metadata flush is being executed. - -Usage -===== - -A zoned block device must first be formatted using the dmzadm tool. This -will analyze the device zone configuration, determine where to place the -metadata sets on the device and initialize the metadata sets. - -Ex:: - - dmzadm --format /dev/sdxx - -For a formatted device, the target can be created normally with the -dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex:: - - echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ - dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/device-mapper/era.rst b/Documentation/device-mapper/era.rst deleted file mode 100644 index 90dd5c670b9f..000000000000 --- a/Documentation/device-mapper/era.rst +++ /dev/null @@ -1,116 +0,0 @@ -====== -dm-era -====== - -Introduction -============ - -dm-era is a target that behaves similarly to the linear target. In -addition it keeps track of which blocks were written within a user -defined period of time called an 'era'. Each era target instance -maintains the current era as a monotonically increasing 32-bit -counter. - -Use cases include tracking changed blocks for backup software, and -partially invalidating the contents of a cache to restore cache -coherency after rolling back a vendor snapshot. - -Constructor -=========== - -era <metadata dev> <origin dev> <block size> - - ================ ====================================================== - metadata dev fast device holding the persistent metadata - origin dev device holding data blocks that may change - block size block size of origin data device, granularity that is - tracked by the target - ================ ====================================================== - -Messages -======== - -None of the dm messages take any arguments. - -checkpoint ----------- - -Possibly move to a new era. You shouldn't assume the era has -incremented.
After sending this message, you should check the -current era via the status line. - -take_metadata_snap ------------------- - -Create a clone of the metadata, to allow a userland process to read it. - -drop_metadata_snap ------------------- - -Drop the metadata snapshot. - -Status -====== - - <#used metadata blocks>/<#total metadata blocks> - - -========================= ============================================== -metadata block size Fixed block size for each metadata block in - sectors -#used metadata blocks Number of metadata blocks used -#total metadata blocks Total number of metadata blocks -current era The current era -held metadata root The location, in blocks, of the metadata root - that has been 'held' for userspace read - access. '-' indicates there is no held root -========================= ============================================== - -Detailed use case -================= - -The scenario of invalidating a cache when rolling back a vendor -snapshot was the primary use case when developing this target: - -Taking a vendor snapshot ------------------------- - -- Send a checkpoint message to the era target -- Make a note of the current era in its status line -- Take vendor snapshot (the era and snapshot should be forever - associated now). - -Rolling back to a vendor snapshot ----------------------------------- - -- Cache enters passthrough mode (see: dm-cache's docs in cache.rst) -- Rollback vendor storage -- Take metadata snapshot -- Ascertain which blocks have been written since the snapshot was taken - by checking each block's era -- Invalidate those blocks in the caching software -- Cache returns to writeback/writethrough mode - -Memory usage -============ - -The target uses a bitset to record writes in the current era. It also -has a spare bitset ready for switching over to a new era.
Other than -that it uses a few 4k blocks for updating metadata:: - - (4 * nr_blocks) bytes + buffers - -Resilience -========== - -Metadata is updated on disk before a write to a previously unwritten -block is performed. As such dm-era should not be affected by a hard -crash such as power failure. - -Userland tools -============== - -Userland tools are found in the increasingly poorly named -thin-provisioning-tools project: - - https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/device-mapper/index.rst b/Documentation/device-mapper/index.rst deleted file mode 100644 index 105e253bc231..000000000000 --- a/Documentation/device-mapper/index.rst +++ /dev/null @@ -1,44 +0,0 @@ -:orphan: - -============= -Device Mapper -============= - -.. toctree:: - :maxdepth: 1 - - cache-policies - cache - delay - dm-crypt - dm-flakey - dm-init - dm-integrity - dm-io - dm-log - dm-queue-length - dm-raid - dm-service-time - dm-uevent - dm-zoned - era - kcopyd - linear - log-writes - persistent-data - snapshot - statistics - striped - switch - thin-provisioning - unstriped - verity - writecache - zero - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/device-mapper/kcopyd.rst b/Documentation/device-mapper/kcopyd.rst deleted file mode 100644 index 7651d395127f..000000000000 --- a/Documentation/device-mapper/kcopyd.rst +++ /dev/null @@ -1,47 +0,0 @@ -====== -kcopyd -====== - -Kcopyd provides the ability to copy a range of sectors from one block-device -to one or more other block-devices, with an asynchronous completion -notification. It is used by dm-snapshot and dm-mirror. - -Users of kcopyd must first create a client and indicate how many memory pages -to set aside for their copy jobs.
This is done with a call to -kcopyd_client_create():: - - int kcopyd_client_create(unsigned int num_pages, - struct kcopyd_client **result); - -To start a copy job, the user must set up io_region structures to describe -the source and destinations of the copy. Each io_region indicates a -block-device along with the starting sector and size of the region. The source -of the copy is given as one io_region structure, and the destinations of the -copy are given as an array of io_region structures:: - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -To start the copy, the user calls kcopyd_copy(), passing in the client -pointer, pointers to the source and destination io_regions, the name of a -completion callback routine, and a pointer to some context data for the copy:: - - int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, - unsigned int num_dests, struct io_region *dests, - unsigned int flags, kcopyd_notify_fn fn, void *context); - - typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, - void *context); - -When the copy completes, kcopyd will call the user's completion routine, -passing back the user's context pointer. It will also indicate if a read or -write error occurred during the copy. - -When a user is done with all their copy jobs, they should call -kcopyd_client_destroy() to delete the kcopyd client, which will release the -associated memory pages:: - - void kcopyd_client_destroy(struct kcopyd_client *kc); diff --git a/Documentation/device-mapper/linear.rst b/Documentation/device-mapper/linear.rst deleted file mode 100644 index 9d17fc6e64a9..000000000000 --- a/Documentation/device-mapper/linear.rst +++ /dev/null @@ -1,63 +0,0 @@ -========= -dm-linear -========= - -Device-Mapper's "linear" target maps a linear range of the Device-Mapper -device onto a linear range of another device. This is the basic building -block of logical volume managers. 
- -Parameters: - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - - -Example scripts -=============== - -:: - - #!/bin/sh - # Create an identity mapping for a device - echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity - -:: - - #!/bin/sh - # Join 2 devices together - size1=`blockdev --getsz $1` - size2=`blockdev --getsz $2` - echo "0 $size1 linear $1 0 - $size1 $size2 linear $2 0" | dmsetup create joined - -:: - - #!/usr/bin/perl -w - # Split a device into 4M chunks and then join them together in reverse order. - - my $name = "reverse"; - my $extent_size = 4 * 1024 * 2; - my $dev = $ARGV[0]; - my $table = ""; - my $count = 0; - - if (!defined($dev)) { - die("Please specify a device.\n"); - } - - my $dev_size = `blockdev --getsz $dev`; - my $extents = int($dev_size / $extent_size) - - (($dev_size % $extent_size) ? 1 : 0); - - while ($extents > 0) { - my $this_start = $count * $extent_size; - $extents--; - $count++; - my $this_offset = $extents * $extent_size; - - $table .= "$this_start $extent_size linear $dev $this_offset\n"; - } - - `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/device-mapper/log-writes.rst b/Documentation/device-mapper/log-writes.rst deleted file mode 100644 index 23141f2ffb7c..000000000000 --- a/Documentation/device-mapper/log-writes.rst +++ /dev/null @@ -1,145 +0,0 @@ -============= -dm-log-writes -============= - -This target takes 2 devices, one to pass all IO to normally, and one to log all -of the write operations to. This is intended for file system developers wishing -to verify the integrity of metadata or data as the file system is written to. -There is a log_write_entry written for every WRITE request and the target is -able to take arbitrary data from userspace to insert into the log. The data -that is in the WRITE requests is copied into the log to make the replay happen -exactly as it happened originally. 
- -Log Ordering -============ - -We log things in order of completion once we are sure the write is no longer in -cache. This means that normal WRITE requests are not actually logged until the -next REQ_PREFLUSH request. This is to make it easier for userspace to replay -the log in a way that correlates to what is on disk and not what is in cache, -to make it easier to detect improper waiting/flushing. - -This works by attaching all WRITE requests to a list once the write completes. -Once we see a REQ_PREFLUSH request we splice this list onto the request and once -the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only -completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to -simulate the worst case scenario with regard to power failures. Consider the -following example (W means write, C means complete): - - W1,W2,W3,C3,C2,Wflush,C1,Cflush - -The log would show the following: - - W3,W2,flush,W1.... - -Again this is to simulate what is actually on disk, this allows us to detect -cases where a power failure at a particular point in time would create an -inconsistent file system. - -Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as -they complete as those requests will obviously bypass the device cache. - -Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would -have all the DISCARD requests, and then the WRITE requests and then the FLUSH -request. Consider the following example: - - WRITE block 1, DISCARD block 1, FLUSH - -If we logged DISCARD when it completed, the replay would look like this: - - DISCARD 1, WRITE 1, FLUSH - -which isn't quite what happened and wouldn't be caught during the log replay. - -Target interface -================ - -i) Constructor - - log-writes - - ============= ============================================== - dev_path Device that all of the IO will go to normally. - log_dev_path Device where the log entries are written to. 
- ============= ============================================== - -ii) Status - - <#logged entries> - - =========================== ======================== - #logged entries Number of logged entries - highest allocated sector Highest allocated sector - =========================== ======================== - -iii) Messages - - mark - - You can use a dmsetup message to set an arbitrary mark in a log. - For example say you want to fsck a file system after every - write, but first you need to replay up to the mkfs to make sure - we're fsck'ing something reasonable, you would do something like - this:: - - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - - This would allow you to replay the log up to the mkfs mark and - then replay from that point on doing the fsck check in the - interval that you want. - - Every log has a mark at the end labeled "dm-log-writes-end". - -Userspace component -=================== - -There is a userspace tool that will replay the log for you in various ways. -It can be found here: https://github.com/josefbacik/log-writes - -Example usage -============= - -Say you want to test fsync on your file system. You would do something like -this:: - - TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" - dmsetup create log --table "$TABLE" - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - mount /dev/mapper/log /mnt/btrfs-test - - dmsetup message log 0 mark fsync - md5sum /mnt/btrfs-test/foo - umount /mnt/btrfs-test - - dmsetup remove log - replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync - mount /dev/sdb /mnt/btrfs-test - md5sum /mnt/btrfs-test/foo - - - Another option is to do a complicated file system operation and verify the file - system is consistent during the entire operation. 
You could do this with: - - TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" - dmsetup create log --table "$TABLE" - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - mount /dev/mapper/log /mnt/btrfs-test - - btrfs filesystem balance /mnt/btrfs-test - umount /mnt/btrfs-test - dmsetup remove log - - replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs - btrfsck /dev/sdb - replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ - --fsck "btrfsck /dev/sdb" --check fua - -And that will replay the log until it sees a FUA request, run the fsck command -and if the fsck passes it will replay to the next FUA, until it is completed or -the fsck command exits abnormally. diff --git a/Documentation/device-mapper/persistent-data.rst b/Documentation/device-mapper/persistent-data.rst deleted file mode 100644 index 2065c3c5a091..000000000000 --- a/Documentation/device-mapper/persistent-data.rst +++ /dev/null @@ -1,88 +0,0 @@ -=============== -Persistent data -=============== - -Introduction -============ - -The more-sophisticated device-mapper targets require complex metadata -that is managed in kernel. In late 2010 we were seeing that various -different targets were rolling their own data structures, for example: - -- Mikulas Patocka's multisnap implementation -- Heinz Mauelshagen's thin provisioning target -- Another btree-based caching target posted to dm-devel -- Another multi-snapshot target based on a design of Daniel Phillips - -Maintaining these data structures takes a lot of work, so if possible -we'd like to reduce the number. - -The persistent-data library is an attempt to provide a re-usable -framework for people who want to store metadata in device-mapper -targets. It's currently used by the thin-provisioning target and an -upcoming hierarchical storage target. - -Overview -======== - -The main documentation is in the header files which can all be found -under drivers/md/persistent-data.
- -The block manager ------------------ - -dm-block-manager.[hc] - -This provides access to the data on disk in fixed sized-blocks. There -is a read/write locking interface to prevent concurrent accesses, and -keep data that is being used in the cache. - -Clients of persistent-data are unlikely to use this directly. - -The transaction manager ------------------------ - -dm-transaction-manager.[hc] - -This restricts access to blocks and enforces copy-on-write semantics. -The only way you can get hold of a writable block through the -transaction manager is by shadowing an existing block (ie. doing -copy-on-write) or allocating a fresh one. Shadowing is elided within -the same transaction so performance is reasonable. The commit method -ensures that all data is flushed before it writes the superblock. -On power failure your metadata will be as it was when last committed. - -The Space Maps --------------- - -dm-space-map.h -dm-space-map-metadata.[hc] -dm-space-map-disk.[hc] - -On-disk data structures that keep track of reference counts of blocks. -Also acts as the allocator of new blocks. Currently two -implementations: a simpler one for managing blocks on a different -device (eg. thinly-provisioned data blocks); and one for managing -the metadata space. The latter is complicated by the need to store -its own data within the space it's managing. - -The data structures -------------------- - -dm-btree.[hc] -dm-btree-remove.c -dm-btree-spine.c -dm-btree-internal.h - -Currently there is only one data structure, a hierarchical btree. -There are plans to add more. For example, something with an -array-like interface would see a lot of use. - -The btree is 'hierarchical' in that you can define it to be composed -of nested btrees, and take multiple keys. For example, the -thin-provisioning target uses a btree with two levels of nesting. -The first maps a device id to a mapping tree, and that in turn maps a -virtual block to a physical block. 
- -Values stored in the btrees can have arbitrary size. Keys are always -64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/device-mapper/snapshot.rst b/Documentation/device-mapper/snapshot.rst deleted file mode 100644 index ccdd8b587a74..000000000000 --- a/Documentation/device-mapper/snapshot.rst +++ /dev/null @@ -1,196 +0,0 @@ -============================== -Device-mapper snapshot support -============================== - -Device-mapper allows you, without massive data copying: - -- To create snapshots of any block device i.e. mountable, saved states of - the block device which are also writable without interfering with the - original content; -- To create device "forks", i.e. multiple different versions of the - same data stream. -- To merge a snapshot of a block device back into the snapshot's origin - device. - -In the first two cases, dm copies only the chunks of data that get -changed and uses a separate copy-on-write (COW) block device for -storage. - -For snapshot merge the contents of the COW storage are merged back into -the origin device. - - -There are three dm targets available: -snapshot, snapshot-origin, and snapshot-merge. - -- snapshot-origin - -which will normally have one or more snapshots based on it. -Reads will be mapped directly to the backing device. For each write, the -original data will be saved in the of each snapshot to keep -its visible content unchanged, at least until the fills up. - - -- snapshot - [<# feature args> []*] - -A snapshot of the block device is created. Changed chunks of - sectors will be stored on the . Writes will -only go to the . Reads will come from the or -from for unchanged data. will often be -smaller than the origin and if it fills up the snapshot will become -useless and be disabled, returning errors. So it is important to monitor -the amount of free space and expand the before it fills up. - - is P (Persistent) or N (Not persistent - will not survive -after reboot). 
O (Overflow) can be added as a persistent store option -to allow userspace to advertise its support for seeing "Overflow" in the -snapshot status. So supported store types are "P", "PO" and "N". - -The difference between persistent and transient is with transient -snapshots less metadata must be saved on disk - they can be kept in -memory by the kernel. - -When loading or unloading the snapshot target, the corresponding -snapshot-origin or snapshot-merge target must be suspended. A failure to -suspend the origin target could result in data corruption. - -Optional features: - - discard_zeroes_cow - a discard issued to the snapshot device that - maps to entire chunks will zero the corresponding exception(s) in - the snapshot's exception store. - - discard_passdown_origin - a discard to the snapshot device is passed - down to the snapshot-origin's underlying device. This doesn't cause - copy-out to the snapshot exception store because the snapshot-origin - target is bypassed. - - The discard_passdown_origin feature depends on the discard_zeroes_cow - feature being enabled. - - -- snapshot-merge - [<# feature args> []*] - -takes the same table arguments as the snapshot target except it only -works with persistent snapshots. This target assumes the role of the -"snapshot-origin" target and must not be loaded if the "snapshot-origin" -is still present for . - -Creates a merging snapshot that takes control of the changed chunks -stored in the of an existing snapshot, through a handover -procedure, and merges these chunks back into the . Once merging -has started (in the background) the may be opened and the merge -will continue while I/O is flowing to it. Changes to the are -deferred until the merging snapshot's corresponding chunk(s) have been -merged. Once merging has started the snapshot device, associated with -the "snapshot" target, will return -EIO when accessed.
- - -How snapshot is used by LVM2 -============================ -When you create the first LVM2 snapshot of a volume, four dm devices are used: - -1) a device containing the original mapping table of the source volume; -2) a device used as the ; -3) a "snapshot" device, combining #1 and #2, which is the visible snapshot - volume; -4) the "original" volume (which uses the device number used by the original - source volume), whose table is replaced by a "snapshot-origin" mapping - from device #1. - -A fixed naming scheme is used, so with the following commands:: - - lvcreate -L 1G -n base volumeGroup - lvcreate -L 100M --snapshot -n snap volumeGroup/base - -we'll have this situation (with volumes in above order):: - - # dmsetup table|grep volumeGroup - - volumeGroup-base-real: 0 2097152 linear 8:19 384 - volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 - volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 - volumeGroup-base: 0 2097152 snapshot-origin 254:11 - - # ls -lL /dev/mapper/volumeGroup-* - brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real - brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow - brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap - brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base - - -How snapshot-merge is used by LVM2 -================================== -A merging snapshot assumes the role of the "snapshot-origin" while -merging. As such the "snapshot-origin" is replaced with -"snapshot-merge". The "-real" device is not changed and the "-cow" -device is renamed to -cow to aid LVM2's cleanup of the -merging snapshot after it completes. The "snapshot" that hands over its -COW device to the "snapshot-merge" is deactivated (unless using lvchange ---refresh); but if it is left active it will simply return I/O errors. 
- -A snapshot will merge into its origin with the following command:: - - lvconvert --merge volumeGroup/snap - -we'll now have this situation:: - - # dmsetup table|grep volumeGroup - - volumeGroup-base-real: 0 2097152 linear 8:19 384 - volumeGroup-base-cow: 0 204800 linear 8:19 2097536 - volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 - - # ls -lL /dev/mapper/volumeGroup-* - brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real - brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow - brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base - - -How to determine when a merging is complete -=========================================== -The snapshot-merge and snapshot status lines end with: - - / - -Both and include both data and metadata. -During merging, the number of sectors allocated gets smaller and -smaller. Merging has finished when the number of sectors holding data -is zero, in other words == . - -Here is a practical example (using a hybrid of lvm and dmsetup commands):: - - # lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g - snap volumeGroup swi-a- 1.00g base 18.97 - - # dmsetup status volumeGroup-snap - 0 8388608 snapshot 397896/2097152 1560 - ^^^^ metadata sectors - - # lvconvert --merge -b volumeGroup/snap - Merging of volume snap started. - - # lvs volumeGroup/snap - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup Owi-a- 4.00g 17.23 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 281688/2097152 1104 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 180480/2097152 712 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 16/2097152 16 - -Merging has finished. 
- -:: - - # lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g diff --git a/Documentation/device-mapper/statistics.rst b/Documentation/device-mapper/statistics.rst deleted file mode 100644 index 3d80a9f850cc..000000000000 --- a/Documentation/device-mapper/statistics.rst +++ /dev/null @@ -1,225 +0,0 @@ -============= -DM statistics -============= - -Device Mapper supports the collection of I/O statistics on user-defined -regions of a DM device. If no regions are defined no statistics are -collected so there isn't any performance impact. Only bio-based DM -devices are currently supported. - -Each user-defined region specifies a starting sector, length and step. -Individual statistics will be collected for each step-sized area within -the range specified. - -The I/O statistics counters for each step-sized area of a region are -in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are -provided: total time spent reading and writing. When the histogram -argument is used, the 14th parameter is reported that represents the -histogram of latencies. All these counters may be accessed by sending -the @stats_print message to the appropriate DM device via dmsetup. - -The reported times are in milliseconds and the granularity depends on -the kernel ticks. When the option precise_timestamps is used, the -reported times are in nanoseconds. - -Each region has a corresponding unique identifier, which we call a -region_id, that is assigned when the region is created. The region_id -must be supplied when querying statistics about the region, deleting the -region, etc. Unique region_ids enable multiple userspace programs to -request and process statistics for the same DM device without stepping -on each other's data. - -The creation of DM statistics will allocate memory via kmalloc or -fallback to using vmalloc space. 
At most, 1/4 of the overall system -memory may be allocated by DM statistics. The admin can see how much -memory is used by reading: - - /sys/module/dm_mod/parameters/stats_current_allocated_bytes - -Messages -======== - - @stats_create [ ...] [ []] - Create a new region and return the region_id. - - - "-" - whole device - "+" - a range of 512-byte sectors - starting with . - - - "" - the range is subdivided into areas each containing - sectors. - "/" - the range is subdivided into the specified - number of areas. - - - The number of optional arguments - - - The following optional arguments are supported: - - precise_timestamps - use precise timer with nanosecond resolution - instead of the "jiffies" variable. When this argument is - used, the resulting times are in nanoseconds instead of - milliseconds. Precise timestamps are a little bit slower - to obtain than jiffies-based timestamps. - histogram:n1,n2,n3,n4,... - collect histogram of latencies. The - numbers n1, n2, etc are times that represent the boundaries - of the histogram. If precise_timestamps is not used, the - times are in milliseconds, otherwise they are in - nanoseconds. For each range, the kernel will report the - number of requests that completed within this range. For - example, if we use "histogram:10,20,30", the kernel will - report four numbers a:b:c:d. a is the number of requests - that took 0-10 ms to complete, b is the number of requests - that took 10-20 ms to complete, c is the number of requests - that took 20-30 ms to complete and d is the number of - requests that took more than 30 ms to complete. - - - An optional parameter. A name that uniquely identifies - the userspace owner of the range. This groups ranges together - so that userspace programs can identify the ranges they - created and ignore those created by others. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use it for anything else. 
- If we omit the number of optional arguments, program id must not - be a number, otherwise it would be interpreted as the number of - optional arguments. - - - An optional parameter. A word that provides auxiliary data - that is useful to the client program that created the range. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use this value for anything. - - @stats_delete - Delete the region with the specified id. - - - region_id returned from @stats_create - - @stats_clear - Clear all the counters except the in-flight i/o counters. - - - region_id returned from @stats_create - - @stats_list [] - List all regions registered with @stats_create. - - - An optional parameter. - If this parameter is specified, only matching regions - are returned. - If it is not specified, all regions are returned. - - Output format: - : + - precise_timestamps histogram:n1,n2,n3,... - - The strings "precise_timestamps" and "histogram" are printed only - if they were specified when creating the region. - - @stats_print [ ] - Print counters for each step-sized area of a region. - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are returned. - - - The number of lines to include in the output. - If omitted, all lines are returned. - - Output format for each step-sized area of a region: - - + - counters - - The first 11 counters have the same meaning as - `/sys/block/*/stat or /proc/diskstats`. - - Please refer to Documentation/iostats.txt for details. - - 1. the number of reads completed - 2. the number of reads merged - 3. the number of sectors read - 4. the number of milliseconds spent reading - 5. the number of writes completed - 6. the number of writes merged - 7. the number of sectors written - 8. the number of milliseconds spent writing - 9. the number of I/Os currently in progress - 10. the number of milliseconds spent doing I/Os - 11. 
the weighted number of milliseconds spent doing I/Os - - Additional counters: - - 12. the total time spent reading in milliseconds - 13. the total time spent writing in milliseconds - - @stats_print_clear [ ] - Atomically print and then clear all the counters except the - in-flight i/o counters. Useful when the client consuming the - statistics does not want to lose any statistics (those updated - between printing and clearing). - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are printed and then cleared. - - - The number of lines to process. - If omitted, all lines are printed and then cleared. - - @stats_set_aux - Store auxiliary data aux_data for the specified region. - - - region_id returned from @stats_create - - - The string that identifies data which is useful to the client - program that created the range. The kernel returns this - string back in the output of @stats_list message, but it - doesn't use this value for anything. - -Examples -======== - -Subdivide the DM device 'vol' into 100 pieces and start collecting -statistics on them:: - - dmsetup message vol 0 @stats_create - /100 - -Set the auxiliary data string to "foo bar baz" (the escape for each -space must also be escaped, otherwise the shell will consume them):: - - dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz - -List the statistics:: - - dmsetup message vol 0 @stats_list - -Print the statistics:: - - dmsetup message vol 0 @stats_print 0 - -Delete the statistics:: - - dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/device-mapper/striped.rst b/Documentation/device-mapper/striped.rst deleted file mode 100644 index e9a8da192ae1..000000000000 --- a/Documentation/device-mapper/striped.rst +++ /dev/null @@ -1,61 +0,0 @@ -========= -dm-stripe -========= - -Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) -device across one or more underlying devices. 
Data is written in "chunks", -with consecutive chunks rotating among the underlying devices. This can -potentially provide improved I/O throughput by utilizing several physical -devices in parallel. - -Parameters: [ ]+ - : - Number of underlying devices. - : - Size of each chunk of data. Must be at least as - large as the system's PAGE_SIZE. - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - -One or more underlying devices can be specified. The striped device size must -be a multiple of the chunk size multiplied by the number of underlying devices. - - -Example scripts -=============== - -:: - - #!/usr/bin/perl -w - # Create a striped device across any number of underlying devices. The device - # will be called "stripe_dev" and have a chunk-size of 128k. - - my $chunk_size = 128 * 2; - my $dev_name = "stripe_dev"; - my $num_devs = @ARGV; - my @devs = @ARGV; - my ($min_dev_size, $stripe_dev_size, $i); - - if (!$num_devs) { - die("Specify at least one device\n"); - } - - $min_dev_size = `blockdev --getsz $devs[0]`; - for ($i = 1; $i < $num_devs; $i++) { - my $this_size = `blockdev --getsz $devs[$i]`; - $min_dev_size = ($min_dev_size < $this_size) ? - $min_dev_size : $this_size; - } - - $stripe_dev_size = $min_dev_size * $num_devs; - $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); - - $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; - for ($i = 0; $i < $num_devs; $i++) { - $table .= " $devs[$i] 0"; - } - - `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/device-mapper/switch.rst b/Documentation/device-mapper/switch.rst deleted file mode 100644 index 7dde06be1a4f..000000000000 --- a/Documentation/device-mapper/switch.rst +++ /dev/null @@ -1,141 +0,0 @@ -========= -dm-switch -========= - -The device-mapper switch target creates a device that supports an -arbitrary mapping of fixed-size regions of I/O across a fixed set of -paths. 
The path used for any specific region can be switched -dynamically by sending the target a message. - -It maps I/O to underlying block devices efficiently when there is a large -number of fixed-sized address regions but there is no simple pattern -that would allow for a compact representation of the mapping such as -dm-stripe. - -Background ----------- - -Dell EqualLogic and some other iSCSI storage arrays use a distributed -frameless architecture. In this architecture, the storage group -consists of a number of distinct storage arrays ("members") each having -independent controllers, disk storage and network adapters. When a LUN -is created it is spread across multiple members. The details of the -spreading are hidden from initiators connected to this storage system. -The storage group exposes a single target discovery portal, no matter -how many members are being used. When iSCSI sessions are created, each -session is connected to an eth port on a single member. Data to a LUN -can be sent on any iSCSI session, and if the blocks being accessed are -stored on another member the I/O will be forwarded as required. This -forwarding is invisible to the initiator. The storage layout is also -dynamic, and the blocks stored on disk may be moved from member to -member as needed to balance the load. - -This architecture simplifies the management and configuration of both -the storage group and initiators. In a multipathing configuration, it -is possible to set up multiple iSCSI sessions to use multiple network -interfaces on both the host and target to take advantage of the -increased network bandwidth. An initiator could use a simple round -robin algorithm to send I/O across all paths and let the storage array -members forward it as necessary, but there is a performance advantage to -sending data directly to the correct member. - -A device-mapper table already lets you map different regions of a -device onto different targets. 
However in this architecture the LUN is -spread with an address region size on the order of 10s of MBs, which -means the resulting table could have more than a million entries and -consume far too much memory. - -Using this device-mapper switch target we can now build a two-layer -device hierarchy: - - Upper Tier - Determine which array member the I/O should be sent to. - Lower Tier - Load balance amongst paths to a particular member. - -The lower tier consists of a single dm multipath device for each member. -Each of these multipath devices contains the set of paths directly to -the array member in one priority group, and leverages existing path -selectors to load balance amongst these paths. We also build a -non-preferred priority group containing paths to other array members for -failover reasons. - -The upper tier consists of a single dm-switch device. This device uses -a bitmap to look up the location of the I/O and choose the appropriate -lower tier device to route the I/O. By using a bitmap we are able to -use 4 bits for each address range in a 16 member group (which is very -large for us). This is a much denser representation than the dm table -b-tree can achieve. - -Construction Parameters -======================= - - [...] [ ]+ - - The number of paths across which to distribute the I/O. - - - The number of 512-byte sectors in a region. Each region can be redirected - to any of the available paths. - - - The number of optional arguments. Currently, no optional arguments - are supported and so this must be zero. - - - The block device that represents a specific path to the device. - - - The offset of the start of data on the specific (in units - of 512-byte sectors). This number is added to the sector number when - forwarding the request to the specific path. Typically it is zero. - -Messages -======== - -set_region_mappings : []: []:... - -Modify the region table by specifying which regions are redirected to -which paths. 
- - - The region number (region size was specified in constructor parameters). - If index is omitted, the next region (previous index + 1) is used. - Expressed in hexadecimal (WITHOUT any prefix like 0x). - - - The path number in the range 0 ... ( - 1). - Expressed in hexadecimal (WITHOUT any prefix like 0x). - -R, - This parameter allows repetitive patterns to be loaded quickly. and - are hexadecimal numbers. The last mappings are repeated in the next - slots. - -Status -====== - -No status line is reported. - -Example -======= - -Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with -the same size. - -Create a switch device with 64kB region size:: - - dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` - switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" - -Set mappings for the first 7 entries to point to devices switch0, switch1, -switch2, switch0, switch1, switch2, switch1:: - - dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 - -Set repetitive mapping. This command:: - - dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 - -is equivalent to:: - - dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ - :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 diff --git a/Documentation/device-mapper/thin-provisioning.rst b/Documentation/device-mapper/thin-provisioning.rst deleted file mode 100644 index bafebf79da4b..000000000000 --- a/Documentation/device-mapper/thin-provisioning.rst +++ /dev/null @@ -1,427 +0,0 @@ -================= -Thin provisioning -================= - -Introduction -============ - -This document describes a collection of device-mapper targets that -between them implement thin-provisioning and snapshots. - -The main highlight of this implementation, compared to the previous -implementation of snapshots, is that it allows many virtual devices to -be stored on the same data volume. 
This simplifies administration and -allows the sharing of data between volumes, thus reducing disk usage. - -Another significant feature is support for an arbitrary depth of -recursive snapshots (snapshots of snapshots of snapshots ...). The -previous implementation of snapshots did this by chaining together -lookup tables, and so performance was O(depth). This new -implementation uses a single data structure to avoid this degradation -with depth. Fragmentation may still be an issue, however, in some -scenarios. - -Metadata is stored on a separate device from data, giving the -administrator some freedom, for example to: - -- Improve metadata resilience by storing metadata on a mirrored volume - but data on a non-mirrored one. - -- Improve performance by storing the metadata on SSD. - -Status -====== - -These targets are considered safe for production use. But different use -cases will have different performance characteristics, for example due -to fragmentation of the data volume. - -If you find this software is not performing as expected please mail -dm-devel@redhat.com with details and we'll try our best to improve -things for you. - -Userspace tools for checking and repairing the metadata have been fully -developed and are available as 'thin_check' and 'thin_repair'. The name -of the package that provides these utilities varies by distribution (on -a Red Hat distribution it is named 'device-mapper-persistent-data'). - -Cookbook -======== - -This section describes some quick recipes for using thin provisioning. -They use the dmsetup program to control the device-mapper driver -directly. End users will be advised to use a higher-level volume -manager such as LVM2 once support has been added. - -Pool device ------------ - -The pool device ties together the metadata volume and the data volume. 
-It maps I/O linearly to the data volume and updates the metadata via -two mechanisms: - -- Function calls from the thin targets - -- Device-mapper 'messages' from userspace which control the creation of new - virtual devices amongst other things. - -Setting up a fresh pool device ------------------------------- - -Setting up a pool device requires a valid metadata device, and a -data device. If you do not have an existing metadata device you can -make one by zeroing the first 4k to indicate empty metadata. - - dd if=/dev/zero of=$metadata_dev bs=4096 count=1 - -The amount of metadata you need will vary according to how many blocks -are shared between thin devices (i.e. through snapshots). If you have -less sharing than average you'll need a larger-than-average metadata device. - -As a guide, we suggest you calculate the number of bytes to use in the -metadata device as 48 * $data_dev_size / $data_block_size but round it up -to 2MB if the answer is smaller. If you're creating large numbers of -snapshots which are recording large amounts of change, you may find you -need to increase this. - -The largest size supported is 16GB: If the device is larger, -a warning will be issued and the excess space will not be used. - -Reloading a pool table ----------------------- - -You may reload a pool's table, indeed this is how the pool is resized -if it runs out of space. (N.B. While specifying a different metadata -device when reloading is not forbidden at the moment, things will go -wrong if it does not route I/O to exactly the same on-disk location as -previously.) - -Using an existing pool device ------------------------------ - -:: - - dmsetup create pool \ - --table "0 20971520 thin-pool $metadata_dev $data_dev \ - $data_block_size $low_water_mark" - -$data_block_size gives the smallest unit of disk space that can be -allocated at a time expressed in units of 512-byte sectors. -$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a -multiple of 128 (64KB). 
$data_block_size cannot be changed after the -thin-pool is created. People primarily interested in thin provisioning -may want to use a value such as 1024 (512KB). People doing lots of -snapshotting may want a smaller value such as 128 (64KB). If you are -not zeroing newly-allocated data, a larger $data_block_size in the -region of 256000 (128MB) is suggested. - -$low_water_mark is expressed in blocks of size $data_block_size. If -free space on the data device drops below this level then a dm event -will be triggered which a userspace daemon should catch allowing it to -extend the pool device. Only one such event will be sent. - -No special event is triggered if a just resumed device's free space is below -the low water mark. However, resuming a device always triggers an -event; a userspace daemon should verify that free space exceeds the low -water mark when handling this event. - -A low water mark for the metadata device is maintained in the kernel and -will trigger a dm event if free space on the metadata device drops below -it. - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the thin-provisioning target behaves like a physical disk that has -a volatile write cache. If power is lost you may lose some recent -writes. The metadata should always be consistent in spite of any crash. - -If data space is exhausted the pool will either error or queue IO -according to the configuration (see: error_if_no_space). If metadata -space is exhausted or a metadata operation fails: the pool will error IO -until the pool is taken offline and repair is performed to 1) fix any -potential inconsistencies and 2) clear the flag that imposes repair. -Once the pool's metadata device is repaired it may be resized, which -will allow the pool to return to normal operation. 
Note that if a pool -is flagged as needing repair, the pool's data and metadata devices -cannot be resized until repair is performed. It should also be noted -that when the pool's metadata space is exhausted the current metadata -transaction is aborted. Given that the pool will cache IO whose -completion may have already been acknowledged to upper IO layers -(e.g. filesystem) it is strongly suggested that consistency checks -(e.g. fsck) be performed on those layers when repair of the pool is -required. - -Thin provisioning ------------------ - -i) Creating a new thinly-provisioned volume. - - To create a new thinly-provisioned volume you must send a message to an - active pool device, /dev/mapper/pool in this example:: - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - - Here '0' is an identifier for the volume, a 24-bit number. It's up - to the caller to allocate and manage these identifiers. If the - identifier is already in use, the message will fail with -EEXIST. - -ii) Using a thinly-provisioned volume. - - Thinly-provisioned volumes are activated using the 'thin' target:: - - dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" - - The last parameter is the identifier for the thinp device. - -Internal snapshots ------------------- - -i) Creating an internal snapshot. - - Snapshots are created with another message to the pool. - - N.B. If the origin device that you wish to snapshot is active, you - must suspend it before creating the snapshot to avoid corruption. - This is NOT enforced at the moment, so please be careful! - - :: - - dmsetup suspend /dev/mapper/thin - dmsetup message /dev/mapper/pool 0 "create_snap 1 0" - dmsetup resume /dev/mapper/thin - - Here '1' is the identifier for the volume, a 24-bit number. '0' is the - identifier for the origin device. - -ii) Using an internal snapshot. - - Once created, the user doesn't have to worry about any connection - between the origin and the snapshot.
Indeed the snapshot is no - different from any other thinly-provisioned device and can be - snapshotted itself via the same method. It's perfectly legal to - have only one of them active, and there's no ordering requirement on - activating or removing them both. (This differs from conventional - device-mapper snapshots.) - - Activate it exactly the same way as any other thinly-provisioned volume:: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" - -External snapshots ------------------- - -You can use an external **read only** device as an origin for a -thinly-provisioned volume. Any read to an unprovisioned area of the -thin device will be passed through to the origin. Writes trigger -the allocation of new blocks as usual. - -One use case for this is VM hosts that want to run guests on -thinly-provisioned volumes but have the base image on another device -(possibly shared between many VMs). - -You must not write to the origin device if you use this technique! -Of course, you may write to the thin device and take internal snapshots -of the thin volume. - -i) Creating a snapshot of an external device - - This is the same as creating a thin device. - You don't mention the origin at this stage. - - :: - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - -ii) Using a snapshot of an external device. - - Append an extra parameter to the thin target specifying the origin:: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" - - N.B. All descendants (internal snapshots) of this snapshot require the - same extra origin parameter. - -Deactivation ------------- - -All devices using a pool must be deactivated before the pool itself -can be. 
- -:: - - dmsetup remove thin - dmsetup remove snap - dmsetup remove pool - -Reference -========= - -'thin-pool' target ------------------- - -i) Constructor - - :: - - thin-pool \ - [ []*] - - Optional feature arguments: - - skip_block_zeroing: - Skip the zeroing of newly-provisioned blocks. - - ignore_discard: - Disable discard support. - - no_discard_passdown: - Don't pass discards down to the underlying - data device, but just remove the mapping. - - read_only: - Don't allow any changes to be made to the pool - metadata. This mode is only available after the - thin-pool has been created and first used in full - read/write mode. It cannot be specified on initial - thin-pool creation. - - error_if_no_space: - Error IOs, instead of queueing, if no space. - - Data block size must be between 64KB (128 sectors) and 1GB - (2097152 sectors) inclusive. - - -ii) Status - - :: - - / - / - ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space - needs_check|- metadata_low_watermark - - transaction id: - A 64-bit number used by userspace to help synchronise with metadata - from volume managers. - - used data blocks / total data blocks - If the number of free blocks drops below the pool's low water mark a - dm event will be sent to userspace. This event is edge-triggered and - it will occur only once after each resume so volume manager writers - should register for the event and then check the target's status. - - held metadata root: - The location, in blocks, of the metadata root that has been - 'held' for userspace read access. '-' indicates there is no - held root. - - discard_passdown|no_discard_passdown - Whether or not discards are actually being passed down to the - underlying device. When this is enabled when loading the table, - it can get disabled if the underlying device doesn't support it. 
- - ro|rw|out_of_data_space - If the pool encounters certain types of device failures it will - drop into a read-only metadata mode in which no changes to - the pool metadata (like allocating new blocks) are permitted. - - In serious cases where even a read-only mode is deemed unsafe - no further I/O will be permitted and the status will just - contain the string 'Fail'. The userspace recovery tools - should then be used. - - error_if_no_space|queue_if_no_space - If the pool runs out of data or metadata space, the pool will - either queue or error the IO destined to the data device. The - default is to queue the IO until more space is added or the - 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool - module parameter can be used to change this timeout -- it - defaults to 60 seconds but may be disabled using a value of 0. - - needs_check - A metadata operation has failed, resulting in the needs_check - flag being set in the metadata's superblock. The metadata - device must be deactivated and checked/repaired before the - thin-pool can be made fully operational again. '-' indicates - needs_check is not set. - - metadata_low_watermark: - Value of metadata low watermark in blocks. The kernel sets this - value internally but userspace needs to know this value to - determine if an event was caused by crossing this threshold. - -iii) Messages - - create_thin - Create a new thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - - create_snap - Create a new snapshot of another thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - is the identifier of the thinly-provisioned device - of which the new device will be a snapshot. - - delete - Deletes a thin device. Irreversible. - - set_transaction_id - Userland volume managers, such as LVM, need a way to - synchronise their external metadata with the internal metadata of the - pool target. 
The thin-pool target offers to store an - arbitrary 64-bit transaction id and return it on the target's - status line. To avoid races you must provide what you think - the current transaction id is when you change it with this - compare-and-swap message. - - reserve_metadata_snap - Reserve a copy of the data mapping btree for use by userland. - This allows userland to inspect the mappings as they were when - this message was executed. Use the pool's status command to - get the root block associated with the metadata snapshot. - - release_metadata_snap - Release a previously reserved copy of the data mapping btree. - -'thin' target -------------- - -i) Constructor - - :: - - thin [] - - pool dev: - the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 - - dev id: - the internal device identifier of the device to be - activated. - - external origin dev: - an optional block device outside the pool to be treated as a - read-only snapshot origin: reads to unprovisioned areas of the - thin target will be mapped to this device. - -The pool doesn't store any size against the thin devices. If you -load a thin target that is smaller than you've been using previously, -then you'll have no access to blocks mapped beyond the end. If you -load a target that is bigger than before, then extra blocks will be -provisioned as and when needed. - -ii) Status - - - If the pool has encountered device errors and failed, the status - will just contain the string 'Fail'. The userspace recovery - tools should then be used. - - In the case where is 0, there is no highest - mapped sector and the value of is unspecified. 
diff --git a/Documentation/device-mapper/unstriped.rst b/Documentation/device-mapper/unstriped.rst deleted file mode 100644 index 0a8d3eb3f072..000000000000 --- a/Documentation/device-mapper/unstriped.rst +++ /dev/null @@ -1,135 +0,0 @@ -================================ -Device-mapper "unstriped" target -================================ - -Introduction -============ - -The device-mapper "unstriped" target provides a transparent mechanism to -unstripe a device-mapper "striped" target to access the underlying disks -without having to touch the true backing block-device. It can also be -used to unstripe a hardware RAID-0 to access backing disks. - -Parameters: - - - - The number of stripes in the RAID 0. - - - The amount of 512B sectors in the chunk striping. - - - The block device you wish to unstripe. - - - The stripe number within the device that corresponds to physical - drive you wish to unstripe. This must be 0 indexed. - - -Why use this module? -==================== - -An example of undoing an existing dm-stripe -------------------------------------------- - -This small bash script will setup 4 loop devices and use the existing -striped target to combine the 4 devices into one. It then will use -the unstriped target ontop of the striped device to access the -individual backing loop devices. 
We write data to the newly exposed -unstriped devices and verify the data written matches the correct -underlying device on the striped array:: - - #!/bin/bash - - MEMBER_SIZE=$((128 * 1024 * 1024)) - NUM=4 - SEQ_END=$((${NUM}-1)) - CHUNK=256 - BS=4096 - - RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) - DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" - COUNT=$((${MEMBER_SIZE} / ${BS})) - - for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct - losetup /dev/loop${i} member-${i} - DM_PARMS+=" /dev/loop${i} 0" - done - - echo $DM_PARMS | dmsetup create raid0 - for i in $(seq 0 ${SEQ_END}); do - echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} - done; - - for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct - diff /dev/mapper/set-${i} member-${i} - done; - - for i in $(seq 0 ${SEQ_END}); do - dmsetup remove set-${i} - done - - dmsetup remove raid0 - - for i in $(seq 0 ${SEQ_END}); do - losetup -d /dev/loop${i} - rm -f member-${i} - done - -Another example ---------------- - -Intel NVMe drives contain two cores on the physical device. -Each core of the drive has segregated access to its LBA range. -The current LBA model has a RAID 0 128k chunk on each core, resulting -in a 256k stripe across the two cores:: - - Core 0: Core 1: - __________ __________ - | LBA 512| | LBA 768| - | LBA 0 | | LBA 256| - ---------- ---------- - -The purpose of this unstriping is to provide better QoS in noisy -neighbor environments. When two partitions are created on the -aggregate drive without this unstriping, reads on one partition -can affect writes on another partition. This is because the partitions -are striped across the two cores. When we unstripe this hardware RAID 0 -and make partitions on each new exposed device the two partitions are now -physically separated. 
- -With the dm-unstriped target we're able to segregate an fio script that -has read and write jobs that are independent of each other. Compared to -when we run the test on a combined drive with partitions, we were able -to get a 92% reduction in read latency using this device mapper target. - - -Example dmsetup usage -===================== - -unstriped ontop of Intel NVMe device that has 2 cores ------------------------------------------------------ - -:: - - dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' - dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' - -There will now be two devices that expose Intel NVMe core 0 and 1 -respectively:: - - /dev/mapper/nvmset0 - /dev/mapper/nvmset1 - -unstriped ontop of striped with 4 drives using 128K chunk size --------------------------------------------------------------- - -:: - - dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' - dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' - dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' - dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/device-mapper/verity.rst b/Documentation/device-mapper/verity.rst deleted file mode 100644 index a4d1c1476d72..000000000000 --- a/Documentation/device-mapper/verity.rst +++ /dev/null @@ -1,229 +0,0 @@ -========= -dm-verity -========= - -Device-Mapper's "verity" target provides transparent integrity checking of -block devices using a cryptographic digest provided by the kernel crypto API. -This target is read-only. - -Construction Parameters -======================= - -:: - - - - - - [<#opt_params> ] - - - This is the type of the on-disk hash format. - - 0 is the original format used in the Chromium OS. - The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeroes. 
- - 1 is the current format that should be used for new devices. - The salt is prepended when hashing and each digest is - padded with zeroes to the power of two. - - - This is the device containing data, the integrity of which needs to be - checked. It may be specified as a path, like /dev/sdaX, or a device number, - :. - - - This is the device that supplies the hash tree data. It may be - specified similarly to the device path and may be the same device. If the - same device is used, the hash_start should be outside the configured - dm-verity device. - - - The block size on a data device in bytes. - Each block corresponds to one digest on the hash device. - - - The size of a hash block in bytes. - - - The number of data blocks on the data device. Additional blocks are - inaccessible. You can place hashes to the same partition as data, in this - case hashes are placed after . - - - This is the offset, in -blocks, from the start of hash_dev - to the root block of the hash tree. - - - The cryptographic hash algorithm used for this device. This should - be the name of the algorithm, like "sha1". - - - The hexadecimal encoding of the cryptographic hash of the root hash block - and the salt. This hash should be trusted as there is no other authenticity - beyond this point. - - - The hexadecimal encoding of the salt value. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional parameters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 1 ignore_corruption - -ignore_corruption - Log corrupted blocks, but allow read operations to proceed normally. - -restart_on_corruption - Restart the system when a corrupted block is discovered. This option is - not compatible with ignore_corruption and requires user space support to - avoid restart loops.
- -ignore_zero_blocks - Do not verify blocks that are expected to contain zeroes and always return - zeroes instead. This may be useful if the partition contains unused blocks - that are not guaranteed to contain zeroes. - -use_fec_from_device - Use forward error correction (FEC) to recover from corruption if hash - verification fails. Use encoding data from the specified device. This - may be the same device where data and hash blocks reside, in which case - fec_start must be outside data and hash areas. - - If the encoding data covers additional metadata, it must be accessible - on the hash device after the hash blocks. - - Note: block sizes for data and hash devices must match. Also, if the - verity is encrypted the should be too. - -fec_roots - Number of generator roots. This equals to the number of parity bytes in - the encoding data. For example, in RS(M, N) encoding, the number of roots - is M-N. - -fec_blocks - The number of encoding data blocks on the FEC device. The block size for - the FEC device is . - -fec_start - This is the offset, in blocks, from the start of the - FEC device to the beginning of the encoding data. - -check_at_most_once - Verify data blocks only the first time they are read from the data device, - rather than every time. This reduces the overhead of dm-verity so that it - can be used on systems that are memory and/or CPU constrained. However, it - provides a reduced level of security because only offline tampering of the - data device's content will be detected, not online tampering. - - Hash blocks are still verified each time they are read from the hash device, - since verification of hash blocks is less performance critical than data - blocks, and a hash block will not be verified any more after all the data - blocks it covers have been verified anyway. - -Theory of operation -=================== - -dm-verity is meant to be set up as part of a verified boot path. 
This -may be anything ranging from a boot using tboot or trustedgrub to just -booting from a known-good device (like a USB drive or CD). - -When a dm-verity device is configured, it is expected that the caller -has been authenticated in some way (cryptographic signatures, etc). -After instantiation, all hashes will be verified on-demand during -disk access. If they cannot be verified up to the root node of the -tree, the root hash, then the I/O will fail. This should detect -tampering with any data on the device and the hash data. - -Cryptographic hashes are used to assert the integrity of the device on a -per-block basis. This allows for a lightweight hash computation on first read -into the page cache. Block hashes are stored linearly, aligned to the nearest -block size. - -If forward error correction (FEC) support is enabled any recovery of -corrupted data will be verified using the cryptographic hash of the -corresponding data. This is why combining error correction with -integrity checking is essential. - -Hash Tree ---------- - -Each node in the tree is a cryptographic hash. If it is a leaf node, the hash -of some data block on disk is calculated. If it is an intermediary node, -the hash of a number of child nodes is calculated. - -Each entry in the tree is a collection of neighboring nodes that fit in one -block. The number is determined based on block_size and the size of the -selected cryptographic digest algorithm. The hashes are linearly-ordered in -this entry and any unaligned trailing space is ignored but included when -calculating the parent node. - -The tree looks something like: - - alg = sha256, num_blocks = 32768, block_size = 4096 - -:: - - [ root ] - / . . . \ - [entry_0] [entry_1] - / . . . \ . . . \ - [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] - / ... \ / . . . \ / \ - blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . 
blk_32767 - - -On-disk format -============== - -The verity kernel code does not read the verity metadata on-disk header. -It only reads the hash blocks which directly follow the header. -It is expected that a user-space tool will verify the integrity of the -verity header. - -Alternatively, the header can be omitted and the dmsetup parameters can -be passed via the kernel command-line in a rooted chain of trust where -the command-line is verified. - -Directly following the header (and with sector number padded to the next hash -block boundary) are the hash blocks which are stored a depth at a time -(starting from the root), sorted in order of increasing index. - -The full specification of kernel parameters and on-disk metadata format -is available at the cryptsetup project's wiki page - - https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity - -Status -====== -V (for Valid) is returned if every check performed so far was valid. -If any check failed, C (for Corruption) is returned. - -Example -======= -Set up a device:: - - # dmsetup create vroot --readonly --table \ - "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ - "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ - "1234000000000000000000000000000000000000000000000000000000000000" - -A command line tool veritysetup is available to compute or verify -the hash tree or activate the kernel device. This is available from -the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ -(as a libcryptsetup extension). - -Create hash on the device:: - - # veritysetup format /dev/sda1 /dev/sda2 - ... 
- Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 - -Activate the device:: - - # veritysetup create vroot /dev/sda1 /dev/sda2 \ - 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/device-mapper/writecache.rst b/Documentation/device-mapper/writecache.rst deleted file mode 100644 index d3d7690f5e8d..000000000000 --- a/Documentation/device-mapper/writecache.rst +++ /dev/null @@ -1,79 +0,0 @@ -================= -Writecache target -================= - -The writecache target caches writes on persistent memory or on SSD. It -doesn't cache reads because reads are supposed to be cached in page cache -in normal RAM. - -When the device is constructed, the first sector should be zeroed or the -first sector should contain valid superblock from previous invocation. - -Constructor parameters: - -1. type of the cache device - "p" or "s" - - - p - persistent memory - - s - SSD -2. the underlying device that will be cached -3. the cache device -4. block size (4096 is recommended; the maximum block size is the page - size) -5. the number of optional parameters (the parameters with an argument - count as two) - - start_sector n (default: 0) - offset from the start of cache device in 512-byte sectors - high_watermark n (default: 50) - start writeback when the number of used blocks reach this - watermark - low_watermark x (default: 45) - stop writeback when the number of used blocks drops below - this watermark - writeback_jobs n (default: unlimited) - limit the number of blocks that are in flight during - writeback. Setting this value reduces writeback - throughput, but it may improve latency of read requests - autocommit_blocks n (default: 64 for pmem, 65536 for ssd) - when the application writes this amount of blocks without - issuing the FLUSH request, the blocks are automatically - commited - autocommit_time ms (default: 1000) - autocommit time in milliseconds. 
The data is automatically - commited if this time passes and no FLUSH request is - received - fua (by default on) - applicable only to persistent memory - use the FUA flag - when writing data from persistent memory back to the - underlying device - nofua - applicable only to persistent memory - don't use the FUA - flag when writing back data and send the FLUSH request - afterwards - - - some underlying devices perform better with fua, some - with nofua. The user should test it - -Status: -1. error indicator - 0 if there was no error, otherwise error number -2. the number of blocks -3. the number of free blocks -4. the number of blocks under writeback - -Messages: - flush - flush the cache device. The message returns successfully - if the cache device was flushed without an error - flush_on_suspend - flush the cache device on next suspend. Use this message - when you are going to remove the cache device. The proper - sequence for removing the cache device is: - - 1. send the "flush_on_suspend" message - 2. load an inactive table with a linear target that maps - to the underlying device - 3. suspend the device - 4. ask for status and verify that there are no errors - 5. resume the device, so that it will use the linear - target - 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/device-mapper/zero.rst b/Documentation/device-mapper/zero.rst deleted file mode 100644 index 11fb5cf4597c..000000000000 --- a/Documentation/device-mapper/zero.rst +++ /dev/null @@ -1,37 +0,0 @@ -======= -dm-zero -======= - -Device-Mapper's "zero" target provides a block-device that always returns -zero'd data on reads and silently drops writes. This is similar behavior to -/dev/zero, but as a block-device instead of a character-device. - -Dm-zero has no target-specific parameters. - -One very interesting use of dm-zero is for creating "sparse" devices in -conjunction with dm-snapshot. 
A sparse device reports a device-size larger -than the amount of actual storage space available for that device. A user can -write data anywhere within the sparse device and read it back like a normal -device. Reads to previously unwritten areas will return a zero'd buffer. When -enough data has been written to fill up the actual storage space, the sparse -device is deactivated. This can be very useful for testing device and -filesystem limitations. - -To create a sparse device, start by creating a dm-zero device that's the -desired size of the sparse device. For this example, we'll assume a 10TB -sparse device:: - - TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors - echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 - -Then create a snapshot of the zero device, using any available block-device as -the COW device. The size of the COW device will determine the amount of real -space available to the sparse device. For this example, we'll assume /dev/sdb1 -is an available 10GB partition:: - - echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ - dmsetup create sparse1 - -This will create a 10TB sparse device called /dev/mapper/sparse1 that has -10GB of actual storage space available. If more than 10GB of data is written -to this device, it will start returning I/O errors. 
diff --git a/MAINTAINERS b/MAINTAINERS index 49e9a58f4799..b0e044be81ac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4735,7 +4735,7 @@ Q: http://patchwork.kernel.org/project/dm-devel/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git T: quilt http://people.redhat.com/agk/patches/linux/editing/ S: Maintained -F: Documentation/device-mapper/ +F: Documentation/admin-guide/device-mapper/ F: drivers/md/Makefile F: drivers/md/Kconfig F: drivers/md/dm* diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5ccac0b77f17..3834332f4963 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -453,7 +453,7 @@ config DM_INIT Enable "dm-mod.create=" parameter to create mapped devices at init time. This option is useful to allow mounting rootfs without requiring an initramfs. - See Documentation/device-mapper/dm-init.rst for dm-mod.create="..." + See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." format. If unsure, say N. diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index b65faef2c4b5..b869316d3722 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -25,7 +25,7 @@ static char *create; * Format: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+] * Table format: <start_sector> <num_sectors> <target_type> <target_args> * - * See Documentation/device-mapper/dm-init.rst for dm-mod.create="..." format + * See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." format * details. */ diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7a87a640f8ba..8a60a4a070ac 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3558,7 +3558,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.5.0+: * * Sync action: - * See Documentation/device-mapper/dm-raid.rst for + * See Documentation/admin-guide/device-mapper/dm-raid.rst for * information on each of these states. */ DMEMIT(" %s", sync_action); -- cgit